fix tr and its test

In addition, this commit substantially reduces the number of allocations that tr does when building the substitution tables.
2025-08-01 21:47:46 +00:00 · 2015-04-28 21:16:11 -04:00 · 2015-04-28 21:16:11 -04:00 · 91827a594a
commit 91827a594a
parent 8e2788bd39
3 changed files with 174 additions and 111 deletions
--- a/src/tr/expand.rs
+++ b/src/tr/expand.rs
@ -0,0 +1,117 @@
+/*
+ * This file is part of the uutils coreutils package.
+ *
+ * (c) Michael Gehring <mg@ebfe.org>
+ * (c) kwantam <kwantam@gmail.com>
+ *     20150428 created `expand` module to eliminate most allocs during setup
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use std::char::from_u32;
+use std::cmp::min;
+use std::iter::Peekable;
+use std::ops::Range;
+
+/// Maps the character that follows a backslash to the character the escape stands for.
+///
+/// `a`, `b`, `f`, `v`, `n`, `r` and `t` become BEL (0x07), BS (0x08), FF (0x0c),
+/// VT (0x0b), LF, CR and TAB respectively. Every other character is returned
+/// unchanged, so an escaped backslash yields a backslash and an unknown escape
+/// such as `\x` yields `x`.
+#[inline]
+fn unescape_char(c: char) -> char {
+    match c {
+        'a' => 0x07u8 as char,
+        'b' => 0x08u8 as char,
+        'f' => 0x0cu8 as char,
+        'v' => 0x0bu8 as char,
+        'n' => '\n',
+        'r' => '\r',
+        't' => '\t',
+        _ => c,
+    }
+}
+
+/// Iterator over the characters of a string with backslash escapes resolved.
+///
+/// It borrows the input and shrinks the borrowed slice as it goes, so it never
+/// allocates.
+struct Unescape<'a> {
+    // the part of the input that has not been consumed yet
+    string: &'a str,
+}
+
+impl<'a> Iterator for Unescape<'a> {
+    type Item = char;
+
+    /// Reports a lower bound of 1 while any input remains and 0 once it is exhausted;
+    /// no upper bound is given.
+    ///
+    /// `ExpandSet::next` relies on this lower bound to find out whether another
+    /// character follows a peeked `-`, so it must stay non-zero whenever input is left.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let slen = self.string.len();
+        (min(slen, 1), None)
+    }
+
+    /// Yields the next character of the input.
+    ///
+    /// A backslash followed by another character is folded into the single character
+    /// returned by `unescape_char`. A backslash that is the last byte of the input is
+    /// yielded as a literal backslash. Returns `None` once the input is used up.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.string.len() == 0 {
+            return None;
+        }
+
+        // is the next character an escape?
+        // (`unwrap` cannot fail: the string was just checked to be non-empty)
+        // `ret` is the value to yield, `idx` the number of bytes it used up
+        let (ret, idx) = match self.string.chars().next().unwrap() {
+            '\\' if self.string.len() > 1 => {
+                // yes---it's \ and it's not the last char in the string
+                // \ is 1 byte long in UTF-8, so slicing at byte 1 lands on a char boundary
+                // and the slice is non-empty, which makes the `unwrap` safe
+                let c = self.string[1..].chars().next().unwrap();
+                (Some(unescape_char(c)), 1 + c.len_utf8())
+            },
+            c => (Some(c), c.len_utf8()),   // not an escape char (or a trailing lone backslash)
+        };
+
+        self.string = &self.string[idx..];              // advance the slice past the consumed bytes
+        ret
+    }
+}
+
+/// Iterator that expands a `tr` set specification into its individual characters.
+///
+/// Backslash escapes are resolved by the inner `Unescape`, and a `first-last`
+/// triple is expanded into every character from `first` through `last`.
+pub struct ExpandSet<'a> {
+    // code points of the range currently being expanded that are still to be
+    // yielded; empty whenever no range is pending
+    range: Range<u32>,
+    // remaining unescaped input, with one character of lookahead
+    unesc: Peekable<Unescape<'a>>,
+}
+
+impl<'a> Iterator for ExpandSet<'a> {
+    type Item = char;
+
+    /// Forwards the hint of the underlying unescaped stream.
+    ///
+    /// Characters still pending in `range` are not counted.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.unesc.size_hint()
+    }
+
+    /// Yields the next character of the expanded set.
+    ///
+    /// Characters of a pending range are handed out first. Otherwise the next
+    /// unescaped character `first` is read; if it is followed by `-` and at least
+    /// one more character `last`, both are consumed and the code points after
+    /// `first` up to and including `last` are queued in `range`. `first` itself is
+    /// returned immediately.
+    ///
+    /// NOTE(review): range detection runs on the already-unescaped stream, so an
+    /// escaped `\-` is treated exactly like a bare `-`.
+    /// NOTE(review): when `last` is not greater than `first` the queued range is
+    /// empty, so only `first` is produced while `-` and `last` are still consumed.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        // drain the pending range first; code points that `from_u32` rejects
+        // (it returns `None` for them) are skipped rather than yielded
+        while let Some(n) = self.range.next() {
+            match from_u32(n) {
+                Some(c) => return Some(c),
+                _ => (),
+            }
+        }
+
+        if let Some(first) = self.unesc.next() {
+            // peek ahead: a range needs a '-' AND something after it
+            if self.unesc.peek() == Some(&'-') && match self.unesc.size_hint() {
+                (x, _) if x > 1 => true,    // lower bound covers the peeked '-' plus at least one more char
+                _ => false,                 // the '-' is the last char, so it is a literal
+            } {
+                self.unesc.next();                      // this is the '-'
+                let last = self.unesc.next().unwrap();  // end of the range; present per the size_hint check
+
+                // `first` is returned below, so the queued range starts one past it
+                // and its exclusive end is one past `last`
+                self.range = first as u32 + 1 .. last as u32 + 1;
+            }
+
+            return Some(first);     // in any case, return the char that was just read
+        }
+
+        None
+    }
+}
+
+impl<'a> ExpandSet<'a> {
+    /// Creates an expander over the set specification `s`.
+    ///
+    /// Nothing is scanned here: the iterator starts with an empty pending range
+    /// and reads `s` only as it is advanced.
+    #[inline]
+    pub fn new(s: &'a str) -> ExpandSet<'a> {
+        ExpandSet {
+            range: 0 .. 0,
+            unesc: Unescape { string: s }.peekable(),
+        }
+    }
+}
--- a/src/tr/tr.rs
+++ b/src/tr/tr.rs
@ -1,98 +1,40 @@
 #![crate_name = "tr"]
-#![feature(collections, core, old_io, rustc_private)]
+#![feature(io, rustc_private)]

 /*
 * This file is part of the uutils coreutils package.
 *
 * (c) Michael Gehring <mg@ebfe.org>
+ * (c) kwantam <kwantam@gmail.com>
+ *     20150428 created `expand` module to eliminate most allocs during setup
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

-extern crate collections;
 extern crate getopts;

 use getopts::OptGroup;
-use std::char::from_u32;
 use std::collections::{BitSet, VecMap};
-use std::old_io::{BufferedReader, print};
-use std::old_io::stdio::{stdin_raw, stdout};
-use std::iter::FromIterator;
-use std::vec::Vec;
+use std::io::{stdin, stdout, BufReader, Read, Write};
+use expand::ExpandSet;

 #[path="../common/util.rs"]
 #[macro_use]
 mod util;

+mod expand;
+
 static NAME : &'static str = "tr";
 static VERSION : &'static str = "1.0.0";
+const BUFFER_LEN: usize = 1024;

-#[inline]
-fn unescape_char(c: char) -> char {
-    match c {
-        'a' => 0x07u8 as char,
-        'b' => 0x08u8 as char,
-        'f' => 0x0cu8 as char,
-        'v' => 0x0bu8 as char,
-        'n' => '\n',
-        'r' => '\r',
-        't' => '\t',
-        _ => c,
-    }
-}
-
-#[inline]
-fn unescape(v: Vec<char>) -> Vec<char> {
-    let mut out = Vec::new();
-    let mut input = v.as_slice();
-    loop {
-        input = match input {
-            ['\\', e, rest..] => {
-                out.push(unescape_char(e));
-                rest
-            }
-            [c, rest..] => {
-                out.push(c);
-                rest
-            }
-            [] => break
-        }
-    }
-    out
-}
-
-#[inline]
-fn expand_range(from: char, to: char) -> Vec<char> {
-    range(from as u32, to as u32 + 1).map(|c| from_u32(c).unwrap()).collect()
-}
-
-fn expand_set(s: &str) -> Vec<char> {
-    let mut set = Vec::<char>::new();
-    let unesc = unescape(FromIterator::from_iter(s.chars()));
-    let mut input = unesc.as_slice();
-
-    loop {
-        input = match input {
-            [f, '-', t, rest..] => {
-                set.push_all(expand_range(f, t).as_slice());
-                rest
-            }
-            [c, rest..] => {
-                set.push(c);
-                rest
-            }
-            [] => break
-        };
-    }
-    set
-}
-
-fn delete(set: Vec<char>, complement: bool) {
+fn delete<'a>(set: ExpandSet<'a>, complement: bool) {
    let mut bset = BitSet::new();
-    let mut out = stdout();
+    let mut stdout = stdout();
+    let mut buf = String::with_capacity(BUFFER_LEN + 4);

-    for &c in set.iter() {
+    for c in set {
        bset.insert(c as usize);
    }

@ -104,42 +46,44 @@ fn delete(set: Vec<char>, complement: bool) {
        }
    };

-    for c in BufferedReader::new(stdin_raw()).chars() {
+    // Filter stdin one char at a time, collecting the surviving characters in `buf`
+    // so that stdout is written in chunks of about BUFFER_LEN bytes.
+    for c in BufReader::new(stdin()).chars() {
        match c {
-            Ok(c) if is_allowed(c) => out.write_char(c).unwrap(),
+            Ok(c) if is_allowed(c) => buf.push(c),
            Ok(_) => (),
            Err(err) => panic!("{}", err),
        };
+        if buf.len() >= BUFFER_LEN {
+            safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
+            // empty the buffer once it has been written, as `tr` does; without this
+            // every later flush would emit the already-written bytes again
+            buf.truncate(0);
+        }
+    }
+    // write whatever is left after the last full chunk
+    if buf.len() > 0 {
+        safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
    }
 }

-fn tr(set1: &[char], set2: &[char]) {
-    const BUFFER_LEN: usize = 1024;
-
+fn tr<'a>(set1: ExpandSet<'a>, mut set2: ExpandSet<'a>) {
    let mut map = VecMap::new();
    let mut stdout = stdout();
-    let mut outbuffer = String::with_capacity(BUFFER_LEN);
+    let mut buf = String::with_capacity(BUFFER_LEN + 4);

-    let set2_len = set2.len();
-    for i in range(0, set1.len()) {
-        if i >= set2_len {
-            map.insert(set1[i] as usize, set2[set2_len - 1]);
-        } else {
-            map.insert(set1[i] as usize, set2[i]);
-        }
+    let mut s2_prev = '_';
+    for i in set1 {
+        s2_prev = set2.next().unwrap_or(s2_prev);
+
+        map.insert(i as usize, s2_prev);
    }

-    for c in BufferedReader::new(stdin_raw()).chars() {
+    for c in BufReader::new(stdin()).chars() {
        match c {
            Ok(inc) => {
                let trc = match map.get(&(inc as usize)) {
                    Some(t) => *t,
                    None => inc,
                };
-                outbuffer.push(trc);
-                if outbuffer.len() >= BUFFER_LEN {
-                    stdout.write_str(outbuffer.as_slice()).unwrap();
-                    outbuffer.clear();
+                buf.push(trc);
+                if buf.len() >= BUFFER_LEN {
+                    safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
+                    buf.truncate(0);
                }
            }
            Err(err) => {
@ -147,8 +91,8 @@ fn tr(set1: &[char], set2: &[char]) {
            }
        }
    }
-    if outbuffer.len() > 0 {
-        stdout.write_str(outbuffer.as_slice()).unwrap();
+    if buf.len() > 0 {
+        safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
    }
 }

@ -158,7 +102,7 @@ fn usage(opts: &[OptGroup]) {
    println!("Usage:");
    println!("  {} [OPTIONS] SET1 [SET2]", NAME);
    println!("");
-    print(getopts::usage("Translate or delete characters.", opts).as_slice());
+    println!("{}", getopts::usage("Translate or delete characters.", opts));
 }

 pub fn uumain(args: Vec<String>) -> i32 {
@ -170,7 +114,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
        getopts::optflag("V", "version", "output version information and exit"),
    ];

-    let matches = match getopts::getopts(args.tail(), &opts) {
+    let matches = match getopts::getopts(&args[1..], &opts) {
        Ok(m) => m,
        Err(err) => {
            show_error!("{}", err);
@ -203,12 +147,12 @@ pub fn uumain(args: Vec<String>) -> i32 {
    }

    if dflag {
-        let set1 = expand_set(sets[0].as_slice());
+        let set1 = ExpandSet::new(sets[0].as_ref());
        delete(set1, cflag);
    } else {
-        let set1 = expand_set(sets[0].as_slice());
-        let set2 = expand_set(sets[1].as_slice());
-        tr(set1.as_slice(), set2.as_slice());
+        let set1 = ExpandSet::new(sets[0].as_ref());
+        let set2 = ExpandSet::new(sets[1].as_ref());
+        tr(set1, set2);
    }

    0
--- a/test/tr.rs
+++ b/test/tr.rs
@ -1,49 +1,51 @@
-#![allow(unstable)]
-
-use std::old_io::process::Command;
+use std::io::Write;
+use std::process::{Command, Stdio};

 static PROGNAME: &'static str = "./tr";

 fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
-    let mut process = Command::new(PROGNAME).args(args).spawn().unwrap();
+    let mut process = Command::new(PROGNAME)
+                                   .args(args)
+                                   .stdin(Stdio::piped())
+                                   .stdout(Stdio::piped())
+                                   .spawn()
+                                   .unwrap_or_else(|e| panic!("{}", e));

-    process.stdin.take().unwrap().write_str(input).unwrap();
+    process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin"))
+        .write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e));

-    let po = match process.wait_with_output() {
-        Ok(p) => p,
-        Err(err) => panic!("{}", err),
-    };
-    po.output
+    let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e));
+    po.stdout
 }

 #[test]
 fn test_toupper() {
    let out = run("!abcd!", &["a-z", "A-Z"]);
-    assert_eq!(out.as_slice(), b"!ABCD!");
+    assert_eq!(&out[..], b"!ABCD!");
 }

 #[test]
 fn test_small_set2() {
    let out = run("@0123456789", &["0-9", "X"]);
-    assert_eq!(out.as_slice(), b"@XXXXXXXXXX");
+    assert_eq!(&out[..], b"@XXXXXXXXXX");
 }

 #[test]
 fn test_unicode() {
    let out = run("(,°□°）, ┬─┬", &[", ┬─┬", "╯︵┻━┻"]);
-    assert_eq!(out.as_slice(), "(╯°□°）╯︵┻━┻".as_bytes());
+    assert_eq!(&out[..], "(╯°□°）╯︵┻━┻".as_bytes());
 }

 #[test]
 fn test_delete() {
    let out = run("aBcD", &["-d", "a-z"]);
-    assert_eq!(out.as_slice(), b"BD");
+    assert_eq!(&out[..], b"BD");
 }

 #[test]
 fn test_delete_complement() {
    let out = run("aBcD", &["-d", "-c", "a-z"]);
-    assert_eq!(out.as_slice(), b"ac");
+    assert_eq!(&out[..], b"ac");
 }