From c59e375c7a4dc5f926a7fb4194af69af48792521 Mon Sep 17 00:00:00 2001 From: Yury Krivopalov Date: Wed, 23 Aug 2017 22:12:03 +0300 Subject: [PATCH 1/3] tr: use as_bytes on whole output string In my environment, on a 1 MiB file, tr takes 30 ms with this fix instead of 44 ms without it. --- src/tr/tr.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/tr/tr.rs b/src/tr/tr.rs index bc904ea57..bd1b35d31 100644 --- a/src/tr/tr.rs +++ b/src/tr/tr.rs @@ -71,7 +71,7 @@ fn tr<'a>(set1: ExpandSet<'a>, mut set2: ExpandSet<'a>) { let mut locked_stdin = stdin.lock(); let mut buffered_stdout = BufWriter::new(stdout()); let mut buf = String::with_capacity(BUFFER_LEN + 4); - let mut char_output_buffer: [u8; 4] = [0;4]; + let mut output_buf = String::with_capacity(BUFFER_LEN + 4); let mut s2_prev = '_'; for i in set1 { @@ -85,13 +85,12 @@ fn tr<'a>(set1: ExpandSet<'a>, mut set2: ExpandSet<'a>) { { // isolation to make borrow checker happy let output_stream = buf.chars().map(|c| *map.get(&(c as usize)).unwrap_or(&c)); - for c in output_stream { - let char_as_bytes = c.encode_utf8(&mut char_output_buffer); - buffered_stdout.write_all(char_as_bytes.as_bytes()).unwrap(); - } + output_buf.extend(output_stream); + buffered_stdout.write_all(output_buf.as_bytes()).unwrap(); } buf.clear(); + output_buf.clear(); } } From b4d8265a0767f6a971c6b24abd1741de8c2a9ad5 Mon Sep 17 00:00:00 2001 From: Yury Krivopalov Date: Thu, 24 Aug 2017 14:26:08 +0300 Subject: [PATCH 2/3] tr: generalize translation --- src/tr/tr.rs | 122 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 74 insertions(+), 48 deletions(-) diff --git a/src/tr/tr.rs b/src/tr/tr.rs index bd1b35d31..308f36079 100644 --- a/src/tr/tr.rs +++ b/src/tr/tr.rs @@ -32,63 +32,80 @@ static NAME: &'static str = "tr"; static VERSION: &'static str = env!("CARGO_PKG_VERSION"); const BUFFER_LEN: usize = 1024; -fn delete(set: ExpandSet, complement: bool) { - let mut bset = BitSet::new(); - let stdin = stdin(); - 
let mut locked_stdin = stdin.lock(); - let mut buffered_stdout = BufWriter::new(stdout()); - let mut buf = String::with_capacity(BUFFER_LEN + 4); - let mut char_output_buffer: [u8; 4] = [0;4]; +trait SymbolTranslator { + fn translate(&self, c: &char, prev_c: &char) -> Option; +} - for c in set { - bset.insert(c as usize); - } +struct DeleteOperation { + bset: BitSet, + complement: bool, +} - let is_allowed = |c : char| { - if complement { - bset.contains(c as usize) - } else { - !bset.contains(c as usize) +impl DeleteOperation { + fn new(set: ExpandSet, complement: bool) -> DeleteOperation { + DeleteOperation { + bset: set.map(|c| c as usize).collect(), + complement: complement } - }; - - while let Ok(length) = locked_stdin.read_line(&mut buf) { - if length == 0 { break } - { // isolation to make borrow checker happy - let filtered = buf.chars().filter(|c| is_allowed(*c)); - for c in filtered { - let char_as_bytes = c.encode_utf8(&mut char_output_buffer); - buffered_stdout.write_all(char_as_bytes.as_bytes()).unwrap(); - } - } - buf.clear(); } } -fn tr<'a>(set1: ExpandSet<'a>, mut set2: ExpandSet<'a>) { - let mut map = FnvHashMap::default(); - let stdin = stdin(); - let mut locked_stdin = stdin.lock(); - let mut buffered_stdout = BufWriter::new(stdout()); +impl SymbolTranslator for DeleteOperation { + fn translate(&self, c: &char, _prev_c: &char) -> Option { + let uc = *c as usize; + if self.complement == self.bset.contains(uc) { + Some(*c) + } else { + None + } + } +} + +struct TranslateOperation { + translate_map: FnvHashMap, +} + +impl TranslateOperation { + fn new(set1: ExpandSet, set2: &mut ExpandSet) -> TranslateOperation { + let mut map = FnvHashMap::default(); + let mut s2_prev = '_'; + for i in set1 { + s2_prev = set2.next().unwrap_or(s2_prev); + + map.insert(i as usize, s2_prev); + } + TranslateOperation { + translate_map: map, + } + } +} + +impl SymbolTranslator for TranslateOperation { + fn translate(&self, c: &char, _prev_c: &char) -> Option { + 
Some(*self.translate_map.get(&(*c as usize)).unwrap_or(c)) + } +} + +fn translate_input(input: &mut BufRead, output: &mut Write, translator: T) { let mut buf = String::with_capacity(BUFFER_LEN + 4); let mut output_buf = String::with_capacity(BUFFER_LEN + 4); + // let mut char_output_buffer: [u8; 4] = [0;4]; - let mut s2_prev = '_'; - for i in set1 { - s2_prev = set2.next().unwrap_or(s2_prev); - - map.insert(i as usize, s2_prev); - } - - while let Ok(length) = locked_stdin.read_line(&mut buf) { + while let Ok(length) = input.read_line(&mut buf) { + let mut prev_c = 0 as char; if length == 0 { break } - { // isolation to make borrow checker happy - let output_stream = buf.chars().map(|c| *map.get(&(c as usize)).unwrap_or(&c)); - output_buf.extend(output_stream); - buffered_stdout.write_all(output_buf.as_bytes()).unwrap(); - } + let filtered = buf.chars().filter_map(|c| { + let res = translator.translate(&c, &prev_c); + if res.is_some() { + prev_c = c; + } + res + }); + output_buf.extend(filtered); + output.write_all(output_buf.as_bytes()).unwrap(); + } buf.clear(); output_buf.clear(); } @@ -110,6 +127,7 @@ pub fn uumain(args: Vec) -> i32 { opts.optflag("C", "", "same as -c"); opts.optflag("d", "delete", "delete characters in SET1"); opts.optflag("h", "help", "display this help and exit"); + opts.optflag("s", "squeeze", ""); opts.optflag("V", "version", "output version information and exit"); let matches = match opts.parse(&args[1..]) { @@ -144,13 +162,21 @@ pub fn uumain(args: Vec) -> i32 { return 1; } + let stdin = stdin(); + let mut locked_stdin = stdin.lock(); + let stdout = stdout(); + let locked_stdout = stdout.lock(); + let mut buffered_stdout = BufWriter::new(locked_stdout); + if dflag { let set1 = ExpandSet::new(sets[0].as_ref()); - delete(set1, cflag); + let delete_op = DeleteOperation::new(set1, cflag); + translate_input(&mut locked_stdin, &mut buffered_stdout, delete_op); } else { let set1 = ExpandSet::new(sets[0].as_ref()); - let set2 = 
ExpandSet::new(sets[1].as_ref()); - tr(set1, set2); + let mut set2 = ExpandSet::new(sets[1].as_ref()); + let op = TranslateOperation::new(set1, &mut set2); + translate_input(&mut locked_stdin, &mut buffered_stdout, op) } 0 From ac375d8b7d7655a8a2f1278ad8f971ad256482bf Mon Sep 17 00:00:00 2001 From: Yury Krivopalov Date: Sat, 26 Aug 2017 15:26:24 +0300 Subject: [PATCH 3/3] tr: add squeeze option --- src/tr/tr.rs | 74 ++++++++++++++++++++++++++++++++++++++++++------ tests/test_tr.rs | 25 ++++++++++++++++ 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/src/tr/tr.rs b/src/tr/tr.rs index 308f36079..4167234a5 100644 --- a/src/tr/tr.rs +++ b/src/tr/tr.rs @@ -61,6 +61,56 @@ impl SymbolTranslator for DeleteOperation { } } +struct SqueezeOperation { + squeeze_set: BitSet, + complement: bool, +} + +impl SqueezeOperation { + fn new(squeeze_set: ExpandSet, complement: bool) -> SqueezeOperation { + SqueezeOperation { + squeeze_set: squeeze_set.map(|c| c as usize).collect(), + complement: complement + } + } +} + +impl SymbolTranslator for SqueezeOperation { + fn translate(&self, c: &char, prev_c: &char) -> Option { + if *prev_c == *c && self.complement != self.squeeze_set.contains(*c as usize) { + None + } else { + Some(*c) + } + } +} + +struct DeleteAndSqueezeOperation { + delete_set: BitSet, + squeeze_set: BitSet, + complement: bool, +} + +impl DeleteAndSqueezeOperation { + fn new(delete_set: ExpandSet, squeeze_set: ExpandSet, complement: bool) -> DeleteAndSqueezeOperation { + DeleteAndSqueezeOperation { + delete_set: delete_set.map(|c| c as usize).collect(), + squeeze_set: squeeze_set.map(|c| c as usize).collect(), + complement: complement + } + } +} + +impl SymbolTranslator for DeleteAndSqueezeOperation { + fn translate(&self, c: &char, prev_c: &char) -> Option { + if self.complement != self.delete_set.contains(*c as usize) || *prev_c == *c && self.squeeze_set.contains(*c as usize) { + None + } else { + Some(*c) + } + } +} + struct TranslateOperation { 
translate_map: FnvHashMap, } @@ -89,7 +139,6 @@ impl SymbolTranslator for TranslateOperation { fn translate_input(input: &mut BufRead, output: &mut Write, translator: T) { let mut buf = String::with_capacity(BUFFER_LEN + 4); let mut output_buf = String::with_capacity(BUFFER_LEN + 4); - // let mut char_output_buffer: [u8; 4] = [0;4]; while let Ok(length) = input.read_line(&mut buf) { let mut prev_c = 0 as char; @@ -127,7 +176,7 @@ pub fn uumain(args: Vec) -> i32 { opts.optflag("C", "", "same as -c"); opts.optflag("d", "delete", "delete characters in SET1"); opts.optflag("h", "help", "display this help and exit"); - opts.optflag("s", "squeeze", ""); + opts.optflag("s", "squeeze", "replace each sequence of a repeated character that is listed in the last specified SET, with a single occurrence of that character"); opts.optflag("V", "version", "output version information and exit"); let matches = match opts.parse(&args[1..]) { @@ -155,10 +204,11 @@ pub fn uumain(args: Vec) -> i32 { let dflag = matches.opt_present("d"); let cflag = matches.opts_present(&["c".to_owned(), "C".to_owned()]); + let sflag = matches.opt_present("s"); let sets = matches.free; - if cflag && !dflag { - show_error!("-c is only supported with -d"); + if cflag && !dflag && !sflag { + show_error!("-c is only supported with -d or -s"); return 1; } @@ -168,12 +218,20 @@ pub fn uumain(args: Vec) -> i32 { let locked_stdout = stdout.lock(); let mut buffered_stdout = BufWriter::new(locked_stdout); + let set1 = ExpandSet::new(sets[0].as_ref()); if dflag { - let set1 = ExpandSet::new(sets[0].as_ref()); - let delete_op = DeleteOperation::new(set1, cflag); - translate_input(&mut locked_stdin, &mut buffered_stdout, delete_op); + if sflag { + let set2 = ExpandSet::new(sets[1].as_ref()); + let op = DeleteAndSqueezeOperation::new(set1, set2, cflag); + translate_input(&mut locked_stdin, &mut buffered_stdout, op); + } else { + let op = DeleteOperation::new(set1, cflag); + translate_input(&mut locked_stdin, &mut 
buffered_stdout, op); + } + } else if sflag { + let op = SqueezeOperation::new(set1, cflag); + translate_input(&mut locked_stdin, &mut buffered_stdout, op); } else { - let set1 = ExpandSet::new(sets[0].as_ref()); let mut set2 = ExpandSet::new(sets[1].as_ref()); let op = TranslateOperation::new(set1, &mut set2); translate_input(&mut locked_stdin, &mut buffered_stdout, op) diff --git a/tests/test_tr.rs b/tests/test_tr.rs index 48a38d7a0..4e4c99bb1 100644 --- a/tests/test_tr.rs +++ b/tests/test_tr.rs @@ -32,3 +32,28 @@ fn test_delete_complement() { new_ucmd!() .args(&["-d", "-c", "a-z"]).pipe_in("aBcD").run().stdout_is("ac"); } + +#[test] +fn test_squeeze() { + new_ucmd!() + .args(&["-s", "a-z"]).pipe_in("aaBBcDcc").run().stdout_is("aBBcDc"); +} + + +#[test] +fn test_squeeze_complement() { + new_ucmd!() + .args(&["-sc", "a-z"]).pipe_in("aaBBcDcc").run().stdout_is("aaBcDcc"); +} + +#[test] +fn test_delete_and_squeeze() { + new_ucmd!() + .args(&["-ds", "a-z", "A-Z"]).pipe_in("abBcB").run().stdout_is("B"); +} + +#[test] +fn test_delete_and_squeeze_complement() { + new_ucmd!() + .args(&["-dsc", "a-z", "A-Z"]).pipe_in("abBcB").run().stdout_is("abc"); +}