1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

tr: Reimplementing set expansion

The goal is to eventually reach feature parity with GNU `tr`.

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>

Implemented a bit of new expansion module

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>

Implemented delete operation

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>

Partially implemented delete operation

Will go through translate next.

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>

Fix formatting...

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>

Implemented translation feature

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>
This commit is contained in:
Hanif Bin Ariffin 2021-07-15 00:04:43 +08:00
parent f9559fea80
commit 840c6e7b91
6 changed files with 527 additions and 26 deletions

47
Cargo.lock generated
View file

@ -119,9 +119,9 @@ checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bitflags"
version = "1.2.1"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
checksum = "2da1976d75adbe5fbc88130ecd119529cf1cc6a93ae1546d8696ee66f0d21af1"
[[package]]
name = "bitvec"
@ -200,7 +200,7 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
dependencies = [
"nom",
"nom 6.1.2",
]
[[package]]
@ -645,9 +645,9 @@ checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
[[package]]
name = "digest"
version = "0.6.2"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5b29bf156f3f4b3c4f610a25ff69370616ae6e0657d416de22645483e72af0a"
checksum = "ecae1c064e29fcabb6c2e9939e53dc7da72ed90234ae36ebfe03a478742efbd1"
dependencies = [
"generic-array",
]
@ -937,6 +937,19 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lexical-core"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
dependencies = [
"arrayvec",
"bitflags",
"cfg-if 1.0.0",
"ryu",
"static_assertions",
]
[[package]]
name = "libc"
version = "0.2.85"
@ -1084,6 +1097,17 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "nom"
version = "5.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"
dependencies = [
"lexical-core",
"memchr 2.4.0",
"version_check",
]
[[package]]
name = "nom"
version = "6.1.2"
@ -1614,6 +1638,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "ryu"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
[[package]]
name = "same-file"
version = "1.0.6"
@ -1754,6 +1784,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.8.0"
@ -2910,6 +2946,7 @@ dependencies = [
"bit-set",
"clap",
"fnv",
"nom 5.1.2",
"uucore",
"uucore_procs",
]

View file

@ -15,7 +15,7 @@ edition = "2018"
path = "src/hashsum.rs"
[dependencies]
digest = "0.6.2"
digest = "0.6.1"
clap = { version = "2.33", features = ["wrap_help"] }
hex = "0.2.0"
libc = "0.2.42"

View file

@ -20,6 +20,7 @@ fnv = "1.0.5"
clap = { version = "2.33", features = ["wrap_help"] }
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
nom = "5.1.2"
[[bin]]
name = "tr"

409
src/uu/tr/src/operation.rs Normal file
View file

@ -0,0 +1,409 @@
use nom::{
branch::alt,
bytes::complete::{tag, take, take_until},
character::complete::one_of,
multi::many0,
sequence::{separated_pair, tuple},
IResult,
};
use std::{
collections::HashMap,
io::{BufRead, Write},
};
/// One parsed element of a `tr` set string.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Sequence {
/// A single character (a literal or a decoded backslash escape).
Char(char),
/// An already-expanded list of characters; ranges, repeats and character
/// classes are all parsed into this variant.
CharRange(Vec<char>),
}
impl Sequence {
pub fn parse_set_string(input: &str) -> Vec<Sequence> {
many0(alt((
alt((
Sequence::parse_octal,
Sequence::parse_backslash,
Sequence::parse_audible_bel,
Sequence::parse_backspace,
Sequence::parse_form_feed,
Sequence::parse_newline,
Sequence::parse_return,
Sequence::parse_horizontal_tab,
Sequence::parse_vertical_tab,
)),
alt((
Sequence::parse_char_range,
Sequence::parse_char_star,
Sequence::parse_char_repeat,
)),
alt((
Sequence::parse_alnum,
Sequence::parse_alpha,
Sequence::parse_blank,
Sequence::parse_control,
Sequence::parse_digit,
Sequence::parse_graph,
Sequence::parse_lower,
Sequence::parse_print,
Sequence::parse_punct,
Sequence::parse_space,
Sequence::parse_space,
Sequence::parse_upper,
Sequence::parse_xdigit,
Sequence::parse_char_equal,
Sequence::parse_char,
)),
)))(input)
.map(|(_, r)| r)
.unwrap()
}
pub fn dissolve(self) -> Vec<char> {
match self {
Sequence::Char(c) => vec![c],
Sequence::CharRange(r) => r,
}
}
/// Sequence parsers
fn parse_char(input: &str) -> IResult<&str, Sequence> {
take(1usize)(input).map(|(l, r)| (l, Sequence::Char(r.chars().next().unwrap())))
}
fn parse_octal(input: &str) -> IResult<&str, Sequence> {
tuple((
tag("\\"),
one_of("01234567"),
one_of("01234567"),
one_of("01234567"),
))(input)
.map(|(l, (_, a, b, c))| {
(
l,
Sequence::Char(
// SAFETY: All the values from \000 to \777 is valid based on a test below...
std::char::from_u32(
a.to_digit(8).unwrap() * 8 * 8
+ b.to_digit(8).unwrap() * 8
+ c.to_digit(8).unwrap(),
)
.unwrap(),
),
)
})
}
fn parse_backslash(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("\\")))(input).map(|(l, _)| (l, Sequence::Char('\\')))
}
fn parse_audible_bel(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("a")))(input).map(|(l, _)| (l, Sequence::Char('\u{0007}')))
}
fn parse_backspace(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("b")))(input).map(|(l, _)| (l, Sequence::Char('\u{0008}')))
}
fn parse_form_feed(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("f")))(input).map(|(l, _)| (l, Sequence::Char('\u{000C}')))
}
fn parse_newline(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("n")))(input).map(|(l, _)| (l, Sequence::Char('\u{000A}')))
}
fn parse_return(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("r")))(input).map(|(l, _)| (l, Sequence::Char('\u{000D}')))
}
fn parse_horizontal_tab(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("t")))(input).map(|(l, _)| (l, Sequence::Char('\u{0009}')))
}
fn parse_vertical_tab(input: &str) -> IResult<&str, Sequence> {
tuple((tag("\\"), tag("v")))(input).map(|(l, _)| (l, Sequence::Char('\u{000B}')))
}
fn parse_char_range(input: &str) -> IResult<&str, Sequence> {
separated_pair(take(1usize), tag("-"), take(1usize))(input).map(|(l, (a, b))| {
(l, {
let (start, end) = (
u32::from(a.chars().next().unwrap()),
u32::from(b.chars().next().unwrap()),
);
if (start >= 97 && start <= 122 && end >= 97 && end <= 122 && end > start)
|| (start >= 65 && start <= 90 && end >= 65 && end <= 90 && end > start)
|| (start >= 48 && start <= 57 && end >= 48 && end <= 57 && end > start)
{
Sequence::CharRange(
(start..=end)
.map(|c| std::char::from_u32(c).unwrap())
.collect(),
)
} else {
// This part is unchecked...not all `u32` => `char` is valid
Sequence::CharRange(
(start..=end)
.map(|c| std::char::from_u32(c).unwrap())
.collect(),
)
}
})
})
}
fn parse_char_star(input: &str) -> IResult<&str, Sequence> {
tuple((tag("["), take(1usize), tag("*"), tag("]")))(input).map(|(_, (_, _, _, _))| todo!())
}
fn parse_char_repeat(input: &str) -> IResult<&str, Sequence> {
tuple((tag("["), take(1usize), tag("*"), take_until("]"), tag("]")))(input).map(
|(l, (_, c, _, n, _))| {
(
l,
Sequence::CharRange(
std::iter::repeat(c.chars().next().unwrap())
.take(n.parse().unwrap())
.collect(),
),
)
},
)
}
fn parse_alnum(input: &str) -> IResult<&str, Sequence> {
tag("[:alnum:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(('a'..='z').chain('A'..'Z').chain('0'..'9').collect()),
)
})
}
fn parse_alpha(input: &str) -> IResult<&str, Sequence> {
tag("[:alpha:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(('a'..='z').chain('A'..'Z').collect()),
)
})
}
fn parse_blank(input: &str) -> IResult<&str, Sequence> {
tag("[:blank:]")(input).map(|(_, _)| todo!())
}
fn parse_control(input: &str) -> IResult<&str, Sequence> {
tag("[:cntrl:]")(input).map(|(_, _)| todo!())
}
fn parse_digit(input: &str) -> IResult<&str, Sequence> {
tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::CharRange(('0'..='9').collect())))
}
fn parse_graph(input: &str) -> IResult<&str, Sequence> {
tag("[:graph:]")(input).map(|(_, _)| todo!())
}
fn parse_lower(input: &str) -> IResult<&str, Sequence> {
tag("[:lower:]")(input).map(|(_, _)| todo!())
}
fn parse_print(input: &str) -> IResult<&str, Sequence> {
tag("[:print:]")(input).map(|(_, _)| todo!())
}
fn parse_punct(input: &str) -> IResult<&str, Sequence> {
tag("[:punct:]")(input).map(|(_, _)| todo!())
}
fn parse_space(input: &str) -> IResult<&str, Sequence> {
tag("[:space:]")(input).map(|(_, _)| todo!())
}
fn parse_upper(input: &str) -> IResult<&str, Sequence> {
tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::CharRange(('A'..='Z').collect())))
}
fn parse_xdigit(input: &str) -> IResult<&str, Sequence> {
tag("[:xdigit:]")(input).map(|(_, _)| todo!())
}
fn parse_char_equal(input: &str) -> IResult<&str, Sequence> {
tuple((tag("[="), take(1usize), tag("=]")))(input).map(|(_, (_, _, _))| todo!())
}
}
/// A per-character transformation applied to the input stream.
pub trait SymbolTranslatorNew {
/// Returns the character to emit in place of `current`, or `None` to drop
/// `current` from the output. Takes `&mut self` so implementations may keep
/// state between calls.
fn translate(&mut self, current: char) -> Option<char>;
}
/// Translator that drops characters based on their membership in a set.
#[derive(Debug, Clone)]
pub struct DeleteOperationNew {
// Sequences each input character is matched against.
set: Vec<Sequence>,
// When false, characters found in `set` are dropped; when true, characters
// NOT found in `set` are dropped instead.
complement_flag: bool,
}
impl DeleteOperationNew {
pub fn new(set: Vec<Sequence>, complement_flag: bool) -> DeleteOperationNew {
DeleteOperationNew {
set,
complement_flag,
}
}
}
impl SymbolTranslatorNew for DeleteOperationNew {
    /// Keeps `current` only when its membership in the set equals the
    /// complement flag: without the flag, set members are dropped; with it,
    /// everything except set members is dropped.
    fn translate(&mut self, current: char) -> Option<char> {
        let in_set = self.set.iter().any(|entry| match entry {
            Sequence::Char(single) => *single == current,
            Sequence::CharRange(members) => members.contains(&current),
        });
        if in_set == self.complement_flag {
            Some(current)
        } else {
            None
        }
    }
}
/// Translator that maps characters of one set onto characters of another.
#[derive(Debug, Clone)]
pub enum TranslateOperationNew {
/// Fixed lookup table; characters without an entry are passed through.
Standard(HashMap<char, char>),
/// Complemented translation. The fields are, in order: the characters of
/// SET1 (passed through unchanged), the not-yet-assigned characters of SET2
/// used as a stack (the next replacement is popped from the end), the
/// replacements already assigned to input characters, and the replacement
/// used once the stack is empty.
Complement(Vec<char>, Vec<char>, HashMap<char, char>, char),
}
impl TranslateOperationNew {
    /// Builds the translation from SET1 to SET2.
    ///
    /// Both sets are flattened to characters first, so lengths are compared
    /// in characters rather than in parsed sequences.
    ///
    /// * `truncate_set2` - despite its name this carries the `-t` flag: SET1
    ///   characters without a counterpart in SET2 are left untranslated.
    ///   Without it, SET2 is padded with its last character, as GNU `tr`
    ///   does. The flag has no effect in complement mode, because the
    ///   `Complement` variant always carries a fallback character.
    /// * `complement` - translate the characters that are NOT in SET1.
    ///
    /// When a SET1 character occurs more than once, the last pairing wins.
    ///
    /// An empty SET2 has nothing to translate to, so it yields an identity
    /// translation instead of panicking. Callers wanting GNU behaviour should
    /// reject an empty SET2 before calling this.
    pub fn new(
        set1: Vec<Sequence>,
        set2: Vec<Sequence>,
        truncate_set2: bool,
        complement: bool,
    ) -> TranslateOperationNew {
        let set1_chars: Vec<char> = set1.into_iter().flat_map(Sequence::dissolve).collect();
        let set2_chars: Vec<char> = set2.into_iter().flat_map(Sequence::dissolve).collect();

        // The padding character is the last *character* of SET2, not its last
        // sequence: repeating a whole trailing range would cycle through it,
        // and an empty trailing sequence would never yield anything.
        let fallback = match set2_chars.last() {
            Some(&last) => last,
            None => return TranslateOperationNew::Standard(HashMap::new()),
        };

        if complement {
            TranslateOperationNew::Complement(
                set1_chars,
                // Reversed so that `pop()` hands the characters out in order.
                set2_chars.into_iter().rev().collect(),
                HashMap::new(),
                fallback,
            )
        } else if truncate_set2 {
            // `zip` stops at the shorter side, which truncates SET1.
            TranslateOperationNew::Standard(set1_chars.into_iter().zip(set2_chars).collect())
        } else {
            TranslateOperationNew::Standard(
                set1_chars
                    .into_iter()
                    .zip(set2_chars.into_iter().chain(std::iter::repeat(fallback)))
                    .collect(),
            )
        }
    }
}
impl SymbolTranslatorNew for TranslateOperationNew {
    /// Translates one character; never drops it (always returns `Some`).
    ///
    /// * `Standard`: looks `current` up in the table and passes it through
    ///   unchanged when there is no entry.
    /// * `Complement`: characters of SET1 pass through unchanged. Every other
    ///   character is assigned the next unused SET2 character the first time
    ///   it is seen (the fallback once SET2 is exhausted), and that assignment
    ///   is remembered for later occurrences.
    fn translate(&mut self, current: char) -> Option<char> {
        match self {
            // Keyed lookup instead of scanning every entry of the map.
            TranslateOperationNew::Standard(map) => {
                Some(map.get(&current).copied().unwrap_or(current))
            }
            TranslateOperationNew::Complement(set1, set2, mapped_characters, fallback) => {
                if let Some(&known) = mapped_characters.get(&current) {
                    return Some(known);
                }
                if set1.contains(&current) {
                    return Some(current);
                }
                let replacement = set2.pop().unwrap_or(*fallback);
                mapped_characters.insert(current, replacement);
                Some(replacement)
            }
        }
    }
}
/// Streams `input` to `output` line by line, passing every character through
/// `translator` and writing only the characters it returns.
///
/// Reading stops at end of input or at the first read error; the error itself
/// is not reported.
///
/// # Panics
///
/// Panics if writing to `output` fails.
pub fn translate_input_new<T>(input: &mut dyn BufRead, output: &mut dyn Write, mut translator: T)
where
    T: SymbolTranslatorNew,
{
    let mut line = String::new();
    let mut translated = String::new();
    loop {
        // Both buffers are reused across iterations.
        line.clear();
        translated.clear();
        match input.read_line(&mut line) {
            Ok(0) | Err(_) => break,
            Ok(_) => {
                for ch in line.chars() {
                    if let Some(mapped) = translator.translate(ch) {
                        translated.push(mapped);
                    }
                }
                output.write_all(translated.as_bytes()).unwrap();
            }
        }
    }
}
#[test]
fn test_parse_char_range() {
    // Expected expansions, spelled out letter by letter so the test does not
    // rely on the same range logic it is checking.
    let lowercase: Vec<char> = "abcdefghijklmnopqrstuvwxyz".chars().collect();
    let uppercase: Vec<char> = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".chars().collect();

    // Empty input yields no sequences.
    assert_eq!(Sequence::parse_set_string(""), vec![]);

    // A single range.
    assert_eq!(
        Sequence::parse_set_string("a-z"),
        vec![Sequence::CharRange(lowercase.clone())]
    );

    // Two adjacent ranges stay separate sequences.
    assert_eq!(
        Sequence::parse_set_string("a-zA-Z"),
        vec![
            Sequence::CharRange(lowercase),
            Sequence::CharRange(uppercase),
        ]
    );

    // Plain characters, including multi-byte ones, become one `Char` each.
    let literals: Vec<Sequence> = [',', ' ', '┬', '─', '┬']
        .iter()
        .map(|&c| Sequence::Char(c))
        .collect();
    assert_eq!(Sequence::parse_set_string(", ┬─┬"), literals);
}
#[test]
fn test_parse_octal() {
    // Every three-digit octal escape from \000 to \777 must parse into
    // exactly one sequence.
    let octal_digits = '0'..='7';
    for first in octal_digits.clone() {
        for second in octal_digits.clone() {
            for third in octal_digits.clone() {
                let escape = format!("\\{}{}{}", first, second, third);
                let parsed = Sequence::parse_set_string(&escape);
                assert!(parsed.len() == 1);
            }
        }
    }
}

View file

@ -12,15 +12,18 @@
#[macro_use]
extern crate uucore;
extern crate nom;
mod expand;
mod operation;
use bit_set::BitSet;
use clap::{crate_version, App, Arg};
use fnv::FnvHashMap;
use operation::{translate_input_new, Sequence, TranslateOperationNew};
use std::io::{stdin, stdout, BufRead, BufWriter, Write};
use crate::expand::ExpandSet;
use crate::{expand::ExpandSet, operation::DeleteOperationNew};
use uucore::InvalidEncodingHandling;
static ABOUT: &str = "translate or delete characters";
@ -31,7 +34,7 @@ mod options {
pub const COMPLEMENT: &str = "complement";
pub const DELETE: &str = "delete";
pub const SQUEEZE: &str = "squeeze-repeats";
pub const TRUNCATE: &str = "truncate";
pub const TRUNCATE_SET1: &str = "truncate-set1";
pub const SETS: &str = "sets";
}
@ -44,15 +47,6 @@ struct DeleteOperation {
complement: bool,
}
impl DeleteOperation {
fn new(set: ExpandSet, complement: bool) -> DeleteOperation {
DeleteOperation {
bset: set.map(|c| c as usize).collect(),
complement,
}
}
}
impl SymbolTranslator for DeleteOperation {
fn translate(&self, c: char, _prev_c: char) -> Option<char> {
let uc = c as usize;
@ -254,7 +248,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
let delete_flag = matches.is_present(options::DELETE);
let complement_flag = matches.is_present(options::COMPLEMENT) || matches.is_present("C");
let squeeze_flag = matches.is_present(options::SQUEEZE);
let truncate_flag = matches.is_present(options::TRUNCATE);
let truncate_set1_flag = matches.is_present(options::TRUNCATE_SET1);
let sets = matches
.values_of(options::SETS)
@ -291,21 +285,26 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
let op = DeleteAndSqueezeOperation::new(set1, set2, complement_flag);
translate_input(&mut locked_stdin, &mut buffered_stdout, op);
} else {
let op = DeleteOperation::new(set1, complement_flag);
translate_input(&mut locked_stdin, &mut buffered_stdout, op);
let op = DeleteOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag);
translate_input_new(&mut locked_stdin, &mut buffered_stdout, op);
}
} else if squeeze_flag {
if sets.len() < 2 {
let op = SqueezeOperation::new(set1, complement_flag);
translate_input(&mut locked_stdin, &mut buffered_stdout, op);
} else {
let op = TranslateAndSqueezeOperation::new(sets, truncate_flag, complement_flag);
let op = TranslateAndSqueezeOperation::new(sets, truncate_set1_flag, complement_flag);
translate_input(&mut locked_stdin, &mut buffered_stdout, op);
}
} else {
let mut set2 = ExpandSet::new(sets[1].as_ref());
let op = TranslateOperation::new(set1, &mut set2, truncate_flag, complement_flag);
translate_input(&mut locked_stdin, &mut buffered_stdout, op);
let op = TranslateOperationNew::new(
Sequence::parse_set_string(&sets[0]),
Sequence::parse_set_string(&sets[1]),
truncate_set1_flag,
complement_flag,
);
println!("op:{:#?}", op);
translate_input_new(&mut locked_stdin, &mut buffered_stdout, op);
}
0
@ -344,8 +343,8 @@ pub fn uu_app() -> App<'static, 'static> {
),
)
.arg(
Arg::with_name(options::TRUNCATE)
.long(options::TRUNCATE)
Arg::with_name(options::TRUNCATE_SET1)
.long(options::TRUNCATE_SET1)
.short("t")
.help("first truncate SET1 to length of SET2"),
)

View file

@ -292,3 +292,58 @@ fn test_more_than_2_sets() {
.pipe_in("hello world")
.fails();
}
#[test]
fn test_basic_translation() {
    // SET1 has seven entries (with `d` listed twice) but SET2 only three; the
    // expected output maps `a` to `y` and everything else to `z`.
    let sets = ["dabcdef", "xyz"];
    let input = "abcdefabcdef";
    let expected = "yzzzzzyzzzzz";
    new_ucmd!()
        .args(&sets)
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
#[test]
fn test_basic_translation_with_alnum_1() {
    // The class follows the literal characters in SET1; every input letter is
    // expected to end up as `z`.
    let sets = ["dabcdef[:alnum:]", "xyz"];
    let input = "abcdefabcdef";
    let expected = "zzzzzzzzzzzz";
    new_ucmd!()
        .args(&sets)
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
#[test]
fn test_basic_translation_with_alnum_2() {
    // Same as the previous case, but with the class ahead of the literal
    // characters in SET1.
    let sets = ["[:alnum:]abc", "xyz"];
    let input = "abcdefabcdef";
    let expected = "zzzzzzzzzzzz";
    new_ucmd!()
        .args(&sets)
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
#[test]
fn test_translation_override_pair() {
    // `a` is listed three times in SET1; the expected output uses the
    // replacement paired with its last occurrence.
    let sets = ["aaa", "xyz"];
    let input = "aaa";
    let expected = "zzz";
    new_ucmd!()
        .args(&sets)
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
#[test]
fn test_translation_case_conversion_works() {
    let lowercase = "abcdefghijklmnopqrstuvwxyz";
    let uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    // Three spellings of the same lower-to-upper mapping: explicit lists,
    // ranges, and character classes.
    let set_pairs = [
        (lowercase, uppercase),
        ("a-z", "A-Z"),
        ("[:lower:]", "[:upper:]"),
    ];

    for &(set1, set2) in set_pairs.iter() {
        new_ucmd!()
            .args(&[set1, set2])
            .pipe_in(lowercase)
            .succeeds()
            .stdout_is(uppercase);
    }
}