diff --git a/Cargo.lock b/Cargo.lock index 8cf7cddcb..aebe260c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,9 +119,9 @@ checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" [[package]] name = "bitflags" -version = "1.2.1" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +checksum = "2da1976d75adbe5fbc88130ecd119529cf1cc6a93ae1546d8696ee66f0d21af1" [[package]] name = "bitvec" @@ -200,7 +200,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89" dependencies = [ - "nom", + "nom 6.1.2", ] [[package]] @@ -645,9 +645,9 @@ checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499" [[package]] name = "digest" -version = "0.6.2" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5b29bf156f3f4b3c4f610a25ff69370616ae6e0657d416de22645483e72af0a" +checksum = "ecae1c064e29fcabb6c2e9939e53dc7da72ed90234ae36ebfe03a478742efbd1" dependencies = [ "generic-array", ] @@ -937,6 +937,19 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexical-core" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" +dependencies = [ + "arrayvec", + "bitflags", + "cfg-if 1.0.0", + "ryu", + "static_assertions", +] + [[package]] name = "libc" version = "0.2.85" @@ -1084,6 +1097,17 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" +[[package]] +name = "nom" +version = "5.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "lexical-core", + "memchr 2.4.0", + "version_check", +] + [[package]] name = "nom" version = "6.1.2" @@ -1614,6 +1638,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + [[package]] name = "same-file" version = "1.0.6" @@ -1754,6 +1784,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.8.0" @@ -2910,6 +2946,7 @@ dependencies = [ "bit-set", "clap", "fnv", + "nom 5.1.2", "uucore", "uucore_procs", ] diff --git a/src/uu/hashsum/Cargo.toml b/src/uu/hashsum/Cargo.toml index 43d78119b..4b541ec81 100644 --- a/src/uu/hashsum/Cargo.toml +++ b/src/uu/hashsum/Cargo.toml @@ -15,7 +15,7 @@ edition = "2018" path = "src/hashsum.rs" [dependencies] -digest = "0.6.2" +digest = "0.6.1" clap = { version = "2.33", features = ["wrap_help"] } hex = "0.2.0" libc = "0.2.42" diff --git a/src/uu/tr/Cargo.toml b/src/uu/tr/Cargo.toml index f75a540ee..13a1616d4 100644 --- a/src/uu/tr/Cargo.toml +++ b/src/uu/tr/Cargo.toml @@ -20,6 +20,7 @@ fnv = "1.0.5" clap = { version = "2.33", features = ["wrap_help"] } uucore = { version=">=0.0.9", package="uucore", path="../../uucore" } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } +nom = "5.1.2" [[bin]] 
name = "tr" diff --git a/src/uu/tr/src/operation.rs b/src/uu/tr/src/operation.rs new file mode 100644 index 000000000..d8ed30c54 --- /dev/null +++ b/src/uu/tr/src/operation.rs @@ -0,0 +1,409 @@ +use nom::{ + branch::alt, + bytes::complete::{tag, take, take_until}, + character::complete::one_of, + multi::many0, + sequence::{separated_pair, tuple}, + IResult, +}; +use std::{ + collections::HashMap, + io::{BufRead, Write}, +}; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum Sequence { + Char(char), + CharRange(Vec), +} + +impl Sequence { + pub fn parse_set_string(input: &str) -> Vec { + many0(alt(( + alt(( + Sequence::parse_octal, + Sequence::parse_backslash, + Sequence::parse_audible_bel, + Sequence::parse_backspace, + Sequence::parse_form_feed, + Sequence::parse_newline, + Sequence::parse_return, + Sequence::parse_horizontal_tab, + Sequence::parse_vertical_tab, + )), + alt(( + Sequence::parse_char_range, + Sequence::parse_char_star, + Sequence::parse_char_repeat, + )), + alt(( + Sequence::parse_alnum, + Sequence::parse_alpha, + Sequence::parse_blank, + Sequence::parse_control, + Sequence::parse_digit, + Sequence::parse_graph, + Sequence::parse_lower, + Sequence::parse_print, + Sequence::parse_punct, + Sequence::parse_space, + Sequence::parse_space, + Sequence::parse_upper, + Sequence::parse_xdigit, + Sequence::parse_char_equal, + Sequence::parse_char, + )), + )))(input) + .map(|(_, r)| r) + .unwrap() + } + + pub fn dissolve(self) -> Vec { + match self { + Sequence::Char(c) => vec![c], + Sequence::CharRange(r) => r, + } + } + + /// Sequence parsers + + fn parse_char(input: &str) -> IResult<&str, Sequence> { + take(1usize)(input).map(|(l, r)| (l, Sequence::Char(r.chars().next().unwrap()))) + } + + fn parse_octal(input: &str) -> IResult<&str, Sequence> { + tuple(( + tag("\\"), + one_of("01234567"), + one_of("01234567"), + one_of("01234567"), + ))(input) + .map(|(l, (_, a, b, c))| { + ( + l, + Sequence::Char( + // SAFETY: All the values from \000 to \777 is valid 
based on a test below... + std::char::from_u32( + a.to_digit(8).unwrap() * 8 * 8 + + b.to_digit(8).unwrap() * 8 + + c.to_digit(8).unwrap(), + ) + .unwrap(), + ), + ) + }) + } + + fn parse_backslash(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("\\")))(input).map(|(l, _)| (l, Sequence::Char('\\'))) + } + + fn parse_audible_bel(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("a")))(input).map(|(l, _)| (l, Sequence::Char('\u{0007}'))) + } + + fn parse_backspace(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("b")))(input).map(|(l, _)| (l, Sequence::Char('\u{0008}'))) + } + + fn parse_form_feed(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("f")))(input).map(|(l, _)| (l, Sequence::Char('\u{000C}'))) + } + + fn parse_newline(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("n")))(input).map(|(l, _)| (l, Sequence::Char('\u{000A}'))) + } + + fn parse_return(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("r")))(input).map(|(l, _)| (l, Sequence::Char('\u{000D}'))) + } + + fn parse_horizontal_tab(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("t")))(input).map(|(l, _)| (l, Sequence::Char('\u{0009}'))) + } + + fn parse_vertical_tab(input: &str) -> IResult<&str, Sequence> { + tuple((tag("\\"), tag("v")))(input).map(|(l, _)| (l, Sequence::Char('\u{000B}'))) + } + + fn parse_char_range(input: &str) -> IResult<&str, Sequence> { + separated_pair(take(1usize), tag("-"), take(1usize))(input).map(|(l, (a, b))| { + (l, { + let (start, end) = ( + u32::from(a.chars().next().unwrap()), + u32::from(b.chars().next().unwrap()), + ); + if (start >= 97 && start <= 122 && end >= 97 && end <= 122 && end > start) + || (start >= 65 && start <= 90 && end >= 65 && end <= 90 && end > start) + || (start >= 48 && start <= 57 && end >= 48 && end <= 57 && end > start) + { + Sequence::CharRange( + (start..=end) + .map(|c| std::char::from_u32(c).unwrap()) + .collect(), + ) + } 
else { + // This part is unchecked...not all `u32` => `char` is valid + Sequence::CharRange( + (start..=end) + .map(|c| std::char::from_u32(c).unwrap()) + .collect(), + ) + } + }) + }) + } + + fn parse_char_star(input: &str) -> IResult<&str, Sequence> { + tuple((tag("["), take(1usize), tag("*"), tag("]")))(input).map(|(_, (_, _, _, _))| todo!()) + } + + fn parse_char_repeat(input: &str) -> IResult<&str, Sequence> { + tuple((tag("["), take(1usize), tag("*"), take_until("]"), tag("]")))(input).map( + |(l, (_, c, _, n, _))| { + ( + l, + Sequence::CharRange( + std::iter::repeat(c.chars().next().unwrap()) + .take(n.parse().unwrap()) + .collect(), + ), + ) + }, + ) + } + + fn parse_alnum(input: &str) -> IResult<&str, Sequence> { + tag("[:alnum:]")(input).map(|(l, _)| { + ( + l, + Sequence::CharRange(('a'..='z').chain('A'..'Z').chain('0'..'9').collect()), + ) + }) + } + + fn parse_alpha(input: &str) -> IResult<&str, Sequence> { + tag("[:alpha:]")(input).map(|(l, _)| { + ( + l, + Sequence::CharRange(('a'..='z').chain('A'..'Z').collect()), + ) + }) + } + + fn parse_blank(input: &str) -> IResult<&str, Sequence> { + tag("[:blank:]")(input).map(|(_, _)| todo!()) + } + + fn parse_control(input: &str) -> IResult<&str, Sequence> { + tag("[:cntrl:]")(input).map(|(_, _)| todo!()) + } + + fn parse_digit(input: &str) -> IResult<&str, Sequence> { + tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::CharRange(('0'..='9').collect()))) + } + + fn parse_graph(input: &str) -> IResult<&str, Sequence> { + tag("[:graph:]")(input).map(|(_, _)| todo!()) + } + + fn parse_lower(input: &str) -> IResult<&str, Sequence> { + tag("[:lower:]")(input).map(|(_, _)| todo!()) + } + + fn parse_print(input: &str) -> IResult<&str, Sequence> { + tag("[:print:]")(input).map(|(_, _)| todo!()) + } + + fn parse_punct(input: &str) -> IResult<&str, Sequence> { + tag("[:punct:]")(input).map(|(_, _)| todo!()) + } + + fn parse_space(input: &str) -> IResult<&str, Sequence> { + tag("[:space:]")(input).map(|(_, _)| 
todo!()) + } + + fn parse_upper(input: &str) -> IResult<&str, Sequence> { + tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::CharRange(('A'..='Z').collect()))) + } + + fn parse_xdigit(input: &str) -> IResult<&str, Sequence> { + tag("[:xdigit:]")(input).map(|(_, _)| todo!()) + } + + fn parse_char_equal(input: &str) -> IResult<&str, Sequence> { + tuple((tag("[="), take(1usize), tag("=]")))(input).map(|(_, (_, _, _))| todo!()) + } +} + +pub trait SymbolTranslatorNew { + fn translate(&mut self, current: char) -> Option; +} + +#[derive(Debug, Clone)] +pub struct DeleteOperationNew { + set: Vec, + complement_flag: bool, +} + +impl DeleteOperationNew { + pub fn new(set: Vec, complement_flag: bool) -> DeleteOperationNew { + DeleteOperationNew { + set, + complement_flag, + } + } +} + +impl SymbolTranslatorNew for DeleteOperationNew { + fn translate(&mut self, current: char) -> Option { + let found = self.set.iter().any(|sequence| match sequence { + Sequence::Char(c) => c.eq(¤t), + Sequence::CharRange(r) => r.iter().any(|c| c.eq(¤t)), + }); + (self.complement_flag == found).then(|| current) + } +} + +#[derive(Debug, Clone)] +pub enum TranslateOperationNew { + Standard(HashMap), + Complement(Vec, Vec, HashMap, char), +} + +impl TranslateOperationNew { + pub fn new( + set1: Vec, + mut set2: Vec, + truncate_set2: bool, + complement: bool, + ) -> TranslateOperationNew { + let fallback = set2.last().cloned().unwrap(); + if truncate_set2 { + set2.truncate(set1.len()); + } + if complement { + TranslateOperationNew::Complement( + set1.into_iter() + .flat_map(Sequence::dissolve) + .rev() + .collect(), + set2.into_iter() + .flat_map(Sequence::dissolve) + .rev() + .collect(), + HashMap::new(), + // TODO: Check how `tr` actually handles this + fallback.dissolve().first().cloned().unwrap(), + ) + } else { + TranslateOperationNew::Standard( + set1.into_iter() + .flat_map(Sequence::dissolve) + .zip( + set2.into_iter() + .chain(std::iter::repeat(fallback)) + .flat_map(Sequence::dissolve), 
+ ) + .collect::>(), + ) + } + } +} + +impl SymbolTranslatorNew for TranslateOperationNew { + fn translate(&mut self, current: char) -> Option { + match self { + TranslateOperationNew::Standard(map) => Some( + map.iter() + .find_map(|(l, r)| l.eq(¤t).then(|| *r)) + .unwrap_or(current), + ), + TranslateOperationNew::Complement(set1, set2, mapped_characters, fallback) => { + // First, see if we have already mapped this character. + // If so, return it. + // Else, check if current character is part of set1 + // If so, return it. + // Else, consume from set2, create the translation pair, and return the mapped character + match mapped_characters.get(¤t) { + Some(k) => Some(*k), + None => match set1.iter().any(|c| c.eq(&¤t)) { + true => Some(current), + false => { + let popped = set2.pop().unwrap_or(*fallback); + mapped_characters.insert(current, popped); + Some(popped) + } + }, + } + } + } + } +} + +pub fn translate_input_new(input: &mut dyn BufRead, output: &mut dyn Write, mut translator: T) +where + T: SymbolTranslatorNew, +{ + let mut buf = String::new(); + let mut output_buf = String::new(); + while let Ok(length) = input.read_line(&mut buf) { + if length == 0 { + break; + } else { + let filtered = buf.chars().filter_map(|c| translator.translate(c)); + output_buf.extend(filtered); + output.write_all(output_buf.as_bytes()).unwrap(); + } + buf.clear(); + output_buf.clear(); + } +} + +#[test] +fn test_parse_char_range() { + assert_eq!(Sequence::parse_set_string(""), vec![]); + assert_eq!( + Sequence::parse_set_string("a-z"), + vec![Sequence::CharRange(vec![ + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', + 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + ])] + ); + assert_eq!( + Sequence::parse_set_string("a-zA-Z"), + vec![ + Sequence::CharRange(vec![ + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + ]), + Sequence::CharRange(vec![ + 'A', 'B', 'C', 
'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + ]) + ] + ); + assert_eq!( + Sequence::parse_set_string(", ┬─┬"), + vec![ + Sequence::Char(','), + Sequence::Char(' '), + Sequence::Char('┬'), + Sequence::Char('─'), + Sequence::Char('┬') + ] + ); +} + +#[test] +fn test_parse_octal() { + for a in '0'..='7' { + for b in '0'..='7' { + for c in '0'..='7' { + assert!( + Sequence::parse_set_string(format!("\\{}{}{}", a, b, c).as_str()).len() == 1 + ); + } + } + } +} diff --git a/src/uu/tr/src/tr.rs b/src/uu/tr/src/tr.rs index 6dd81badf..1e7236d6e 100644 --- a/src/uu/tr/src/tr.rs +++ b/src/uu/tr/src/tr.rs @@ -12,15 +12,18 @@ #[macro_use] extern crate uucore; +extern crate nom; mod expand; +mod operation; use bit_set::BitSet; use clap::{crate_version, App, Arg}; use fnv::FnvHashMap; +use operation::{translate_input_new, Sequence, TranslateOperationNew}; use std::io::{stdin, stdout, BufRead, BufWriter, Write}; -use crate::expand::ExpandSet; +use crate::{expand::ExpandSet, operation::DeleteOperationNew}; use uucore::InvalidEncodingHandling; static ABOUT: &str = "translate or delete characters"; @@ -31,7 +34,7 @@ mod options { pub const COMPLEMENT: &str = "complement"; pub const DELETE: &str = "delete"; pub const SQUEEZE: &str = "squeeze-repeats"; - pub const TRUNCATE: &str = "truncate"; + pub const TRUNCATE_SET1: &str = "truncate-set1"; pub const SETS: &str = "sets"; } @@ -44,15 +47,6 @@ struct DeleteOperation { complement: bool, } -impl DeleteOperation { - fn new(set: ExpandSet, complement: bool) -> DeleteOperation { - DeleteOperation { - bset: set.map(|c| c as usize).collect(), - complement, - } - } -} - impl SymbolTranslator for DeleteOperation { fn translate(&self, c: char, _prev_c: char) -> Option { let uc = c as usize; @@ -254,7 +248,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 { let delete_flag = matches.is_present(options::DELETE); let complement_flag = matches.is_present(options::COMPLEMENT) 
|| matches.is_present("C"); let squeeze_flag = matches.is_present(options::SQUEEZE); - let truncate_flag = matches.is_present(options::TRUNCATE); + let truncate_set1_flag = matches.is_present(options::TRUNCATE_SET1); let sets = matches .values_of(options::SETS) @@ -291,21 +285,26 @@ pub fn uumain(args: impl uucore::Args) -> i32 { let op = DeleteAndSqueezeOperation::new(set1, set2, complement_flag); translate_input(&mut locked_stdin, &mut buffered_stdout, op); } else { - let op = DeleteOperation::new(set1, complement_flag); - translate_input(&mut locked_stdin, &mut buffered_stdout, op); + let op = DeleteOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag); + translate_input_new(&mut locked_stdin, &mut buffered_stdout, op); } } else if squeeze_flag { if sets.len() < 2 { let op = SqueezeOperation::new(set1, complement_flag); translate_input(&mut locked_stdin, &mut buffered_stdout, op); } else { - let op = TranslateAndSqueezeOperation::new(sets, truncate_flag, complement_flag); + let op = TranslateAndSqueezeOperation::new(sets, truncate_set1_flag, complement_flag); translate_input(&mut locked_stdin, &mut buffered_stdout, op); } } else { - let mut set2 = ExpandSet::new(sets[1].as_ref()); - let op = TranslateOperation::new(set1, &mut set2, truncate_flag, complement_flag); - translate_input(&mut locked_stdin, &mut buffered_stdout, op); + let op = TranslateOperationNew::new( + Sequence::parse_set_string(&sets[0]), + Sequence::parse_set_string(&sets[1]), + truncate_set1_flag, + complement_flag, + ); + println!("op:{:#?}", op); + translate_input_new(&mut locked_stdin, &mut buffered_stdout, op); } 0 @@ -344,8 +343,8 @@ pub fn uu_app() -> App<'static, 'static> { ), ) .arg( - Arg::with_name(options::TRUNCATE) - .long(options::TRUNCATE) + Arg::with_name(options::TRUNCATE_SET1) + .long(options::TRUNCATE_SET1) .short("t") .help("first truncate SET1 to length of SET2"), ) diff --git a/tests/by-util/test_tr.rs b/tests/by-util/test_tr.rs index 8a3e36625..cffb7153f 
/// SET2 is shorter than SET1, and `d` is listed twice in SET1.
#[test]
fn test_basic_translation() {
    let sets = ["dabcdef", "xyz"];
    new_ucmd!()
        .args(&sets[..])
        .pipe_in("abcdefabcdef")
        .succeeds()
        .stdout_is("yzzzzzyzzzzz");
}

/// A character class following literal characters in SET1.
#[test]
fn test_basic_translation_with_alnum_1() {
    let sets = ["dabcdef[:alnum:]", "xyz"];
    new_ucmd!()
        .args(&sets[..])
        .pipe_in("abcdefabcdef")
        .succeeds()
        .stdout_is("zzzzzzzzzzzz");
}

/// A character class preceding literal characters in SET1.
#[test]
fn test_basic_translation_with_alnum_2() {
    let sets = ["[:alnum:]abc", "xyz"];
    new_ucmd!()
        .args(&sets[..])
        .pipe_in("abcdefabcdef")
        .succeeds()
        .stdout_is("zzzzzzzzzzzz");
}

/// The same character repeated in SET1: the expected output uses the last
/// replacement.
#[test]
fn test_translation_override_pair() {
    let sets = ["aaa", "xyz"];
    new_ucmd!()
        .args(&sets[..])
        .pipe_in("aaa")
        .succeeds()
        .stdout_is("zzz");
}

/// Lower-to-upper conversion written as literals, as ranges and as classes.
#[test]
fn test_translation_case_conversion_works() {
    const LOWER: &str = "abcdefghijklmnopqrstuvwxyz";
    const UPPER: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    let spellings = [[LOWER, UPPER], ["a-z", "A-Z"], ["[:lower:]", "[:upper:]"]];
    for sets in spellings.iter() {
        new_ucmd!()
            .args(&sets[..])
            .pipe_in(LOWER)
            .succeeds()
            .stdout_is(UPPER);
    }
}