1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

tr: operate on bytes instead of chars

This commit is contained in:
Terts Diepraam 2023-12-11 13:35:17 +01:00
parent 4442b35370
commit bc5b5e013a
5 changed files with 115 additions and 151 deletions

View file

@ -1,36 +0,0 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (strings) anychar combinator
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{anychar, one_of},
combinator::{map_opt, recognize},
multi::{many0, many_m_n},
sequence::preceded,
IResult,
};
/// Parses one octal escape: a backslash followed by one to three octal digits.
///
/// On success the parser yields the `char` whose code point equals the value
/// of the digits; the remaining input starts right after the last digit
/// consumed. If the digits cannot be turned into a `char`, `map_opt` rejects
/// the match and the parser reports an error.
fn parse_octal(input: &str) -> IResult<&str, char> {
    map_opt(
        // Keep only the digit run (the backslash is dropped by `preceded`).
        preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))),
        // Base-8 text -> numeric code point -> `char`; any failure becomes `None`.
        |digits: &str| {
            u32::from_str_radix(digits, 8)
                .ok()
                .and_then(std::char::from_u32)
        },
    )(input)
}
/// Returns a copy of `input` in which every octal escape recognised by
/// `parse_octal` is replaced by the character it denotes.
///
/// Characters that are not part of such an escape are copied through
/// unchanged via `anychar`.
pub fn reduce_octal_to_char(input: &str) -> String {
    // Try an octal escape first at each position and fall back to taking a
    // single character. The `unwrap` mirrors the original code: `many0` is
    // expected to return `Ok` once neither alternative matches any more.
    let (_, decoded) = many0(alt((parse_octal, anychar)))(input).unwrap();
    decoded.into_iter().collect()
}

View file

@ -7,10 +7,10 @@
use nom::{ use nom::{
branch::alt, branch::alt,
bytes::complete::tag, bytes::complete::{tag, take},
character::complete::{anychar, digit1}, character::complete::{digit1, one_of},
combinator::{map, peek, value}, combinator::{map, map_opt, peek, recognize, value},
multi::many0, multi::{many0, many_m_n},
sequence::{delimited, preceded, separated_pair}, sequence::{delimited, preceded, separated_pair},
IResult, IResult,
}; };
@ -62,10 +62,10 @@ impl UError for BadSequence {}
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub enum Sequence { pub enum Sequence {
Char(char), Char(u8),
CharRange(u32, u32), CharRange(u8, u8),
CharStar(char), CharStar(u8),
CharRepeat(char, usize), CharRepeat(u8, usize),
Alnum, Alnum,
Alpha, Alpha,
Blank, Blank,
@ -81,21 +81,17 @@ pub enum Sequence {
} }
impl Sequence { impl Sequence {
pub fn flatten(&self) -> Box<dyn Iterator<Item = char>> { pub fn flatten(&self) -> Box<dyn Iterator<Item = u8>> {
match self { match self {
Self::Char(c) => Box::new(std::iter::once(*c)), Self::Char(c) => Box::new(std::iter::once(*c)),
Self::CharRange(l, r) => Box::new((*l..=*r).flat_map(std::char::from_u32)), Self::CharRange(l, r) => Box::new(*l..=*r),
Self::CharStar(c) => Box::new(std::iter::repeat(*c)), Self::CharStar(c) => Box::new(std::iter::repeat(*c)),
Self::CharRepeat(c, n) => Box::new(std::iter::repeat(*c).take(*n)), Self::CharRepeat(c, n) => Box::new(std::iter::repeat(*c).take(*n)),
Self::Alnum => Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z')), Self::Alnum => Box::new((b'0'..=b'9').chain(b'A'..=b'Z').chain(b'a'..=b'z')),
Self::Alpha => Box::new(('A'..='Z').chain('a'..='z')), Self::Alpha => Box::new((b'A'..=b'Z').chain(b'a'..=b'z')),
Self::Blank => Box::new(unicode_table::BLANK.iter().cloned()), Self::Blank => Box::new(unicode_table::BLANK.iter().cloned()),
Self::Control => Box::new( Self::Control => Box::new((0..=31).chain(std::iter::once(127))),
(0..=31) Self::Digit => Box::new(b'0'..=b'9'),
.chain(std::iter::once(127))
.flat_map(std::char::from_u32),
),
Self::Digit => Box::new('0'..='9'),
Self::Graph => Box::new( Self::Graph => Box::new(
(48..=57) // digit (48..=57) // digit
.chain(65..=90) // uppercase .chain(65..=90) // uppercase
@ -105,10 +101,9 @@ impl Sequence {
.chain(58..=64) .chain(58..=64)
.chain(91..=96) .chain(91..=96)
.chain(123..=126) .chain(123..=126)
.chain(std::iter::once(32)) // space .chain(std::iter::once(32)), // space
.flat_map(std::char::from_u32),
), ),
Self::Lower => Box::new('a'..='z'), Self::Lower => Box::new(b'a'..=b'z'),
Self::Print => Box::new( Self::Print => Box::new(
(48..=57) // digit (48..=57) // digit
.chain(65..=90) // uppercase .chain(65..=90) // uppercase
@ -117,29 +112,22 @@ impl Sequence {
.chain(33..=47) .chain(33..=47)
.chain(58..=64) .chain(58..=64)
.chain(91..=96) .chain(91..=96)
.chain(123..=126) .chain(123..=126),
.flat_map(std::char::from_u32),
),
Self::Punct => Box::new(
(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(std::char::from_u32),
), ),
Self::Punct => Box::new((33..=47).chain(58..=64).chain(91..=96).chain(123..=126)),
Self::Space => Box::new(unicode_table::SPACES.iter().cloned()), Self::Space => Box::new(unicode_table::SPACES.iter().cloned()),
Self::Upper => Box::new('A'..='Z'), Self::Upper => Box::new(b'A'..=b'Z'),
Self::Xdigit => Box::new(('0'..='9').chain('A'..='F').chain('a'..='f')), Self::Xdigit => Box::new((b'0'..=b'9').chain(b'A'..=b'F').chain(b'a'..=b'f')),
} }
} }
// Hide all the nasty sh*t in here // Hide all the nasty sh*t in here
// TODO: Make the 2 set lazily generate the character mapping as necessary. // TODO: Make the 2 set lazily generate the character mapping as necessary.
pub fn solve_set_characters( pub fn solve_set_characters(
set1_str: &str, set1_str: &[u8],
set2_str: &str, set2_str: &[u8],
truncate_set1_flag: bool, truncate_set1_flag: bool,
) -> Result<(Vec<char>, Vec<char>), BadSequence> { ) -> Result<(Vec<u8>, Vec<u8>), BadSequence> {
let set1 = Self::from_str(set1_str)?; let set1 = Self::from_str(set1_str)?;
let set2 = Self::from_str(set2_str)?; let set2 = Self::from_str(set2_str)?;
@ -164,7 +152,7 @@ impl Sequence {
.count(); .count();
let star_compensate_len = set1_len.saturating_sub(set2_len); let star_compensate_len = set1_len.saturating_sub(set2_len);
let (left, right) = (partition.next(), partition.next()); let (left, right) = (partition.next(), partition.next());
let set2_solved: Vec<char> = match (left, right) { let set2_solved: Vec<_> = match (left, right) {
(None, None) => match char_star { (None, None) => match char_star {
Some(c) => std::iter::repeat(*c).take(star_compensate_len).collect(), Some(c) => std::iter::repeat(*c).take(star_compensate_len).collect(),
None => std::iter::empty().collect(), None => std::iter::empty().collect(),
@ -201,7 +189,7 @@ impl Sequence {
.collect(), .collect(),
}, },
}; };
let mut set1_solved: Vec<char> = set1.iter().flat_map(Self::flatten).collect(); let mut set1_solved: Vec<_> = set1.iter().flat_map(Self::flatten).collect();
if truncate_set1_flag { if truncate_set1_flag {
set1_solved.truncate(set2_solved.len()); set1_solved.truncate(set2_solved.len());
} }
@ -216,7 +204,7 @@ impl Sequence {
} }
impl Sequence { impl Sequence {
pub fn from_str(input: &str) -> Result<Vec<Self>, BadSequence> { pub fn from_str(input: &[u8]) -> Result<Vec<Self>, BadSequence> {
many0(alt(( many0(alt((
Self::parse_char_range, Self::parse_char_range,
Self::parse_char_star, Self::parse_char_star,
@ -232,27 +220,38 @@ impl Sequence {
.collect::<Result<Vec<_>, _>>() .collect::<Result<Vec<_>, _>>()
} }
fn parse_backslash(input: &str) -> IResult<&str, char> { fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
preceded(tag("\\"), anychar)(input).map(|(l, a)| { map_opt(
preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))),
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).expect("boop"), 8).ok(),
)(input)
}
fn parse_backslash(input: &[u8]) -> IResult<&[u8], u8> {
preceded(tag("\\"), Self::single_char)(input).map(|(l, a)| {
let c = match a { let c = match a {
'a' => unicode_table::BEL, b'a' => unicode_table::BEL,
'b' => unicode_table::BS, b'b' => unicode_table::BS,
'f' => unicode_table::FF, b'f' => unicode_table::FF,
'n' => unicode_table::LF, b'n' => unicode_table::LF,
'r' => unicode_table::CR, b'r' => unicode_table::CR,
't' => unicode_table::HT, b't' => unicode_table::HT,
'v' => unicode_table::VT, b'v' => unicode_table::VT,
x => x, x => x,
}; };
(l, c) (l, c)
}) })
} }
fn parse_backslash_or_char(input: &str) -> IResult<&str, char> { fn parse_backslash_or_char(input: &[u8]) -> IResult<&[u8], u8> {
alt((Self::parse_backslash, anychar))(input) alt((Self::parse_octal, Self::parse_backslash, Self::single_char))(input)
} }
fn parse_char_range(input: &str) -> IResult<&str, Result<Self, BadSequence>> { fn single_char(input: &[u8]) -> IResult<&[u8], u8> {
take(1usize)(input).map(|(l, a)| (l, a[0]))
}
fn parse_char_range(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
separated_pair( separated_pair(
Self::parse_backslash_or_char, Self::parse_backslash_or_char,
tag("-"), tag("-"),
@ -261,41 +260,42 @@ impl Sequence {
.map(|(l, (a, b))| { .map(|(l, (a, b))| {
(l, { (l, {
let (start, end) = (u32::from(a), u32::from(b)); let (start, end) = (u32::from(a), u32::from(b));
Ok(Self::CharRange(start, end)) Ok(Self::CharRange(start as u8, end as u8))
}) })
}) })
} }
fn parse_char_star(input: &str) -> IResult<&str, Result<Self, BadSequence>> { fn parse_char_star(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(tag("["), Self::parse_backslash_or_char, tag("*]"))(input) delimited(tag("["), Self::parse_backslash_or_char, tag("*]"))(input)
.map(|(l, a)| (l, Ok(Self::CharStar(a)))) .map(|(l, a)| (l, Ok(Self::CharStar(a))))
} }
fn parse_char_repeat(input: &str) -> IResult<&str, Result<Self, BadSequence>> { fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited( delimited(
tag("["), tag("["),
separated_pair(Self::parse_backslash_or_char, tag("*"), digit1), separated_pair(Self::parse_backslash_or_char, tag("*"), digit1),
tag("]"), tag("]"),
)(input) )(input)
.map(|(l, (c, cnt_str))| { .map(|(l, (c, cnt_str))| {
let result = if cnt_str.starts_with('0') { let s = String::from_utf8_lossy(cnt_str);
match usize::from_str_radix(cnt_str, 8) { let result = if cnt_str.starts_with(b"0") {
match usize::from_str_radix(&s, 8) {
Ok(0) => Ok(Self::CharStar(c)), Ok(0) => Ok(Self::CharStar(c)),
Ok(count) => Ok(Self::CharRepeat(c, count)), Ok(count) => Ok(Self::CharRepeat(c, count)),
Err(_) => Err(BadSequence::InvalidRepeatCount(cnt_str.to_string())), Err(_) => Err(BadSequence::InvalidRepeatCount(s.to_string())),
} }
} else { } else {
match cnt_str.parse::<usize>() { match s.parse::<usize>() {
Ok(0) => Ok(Self::CharStar(c)), Ok(0) => Ok(Self::CharStar(c)),
Ok(count) => Ok(Self::CharRepeat(c, count)), Ok(count) => Ok(Self::CharRepeat(c, count)),
Err(_) => Err(BadSequence::InvalidRepeatCount(cnt_str.to_string())), Err(_) => Err(BadSequence::InvalidRepeatCount(s.to_string())),
} }
}; };
(l, result) (l, result)
}) })
} }
fn parse_class(input: &str) -> IResult<&str, Result<Self, BadSequence>> { fn parse_class(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited( delimited(
tag("[:"), tag("[:"),
alt(( alt((
@ -322,7 +322,7 @@ impl Sequence {
)(input) )(input)
} }
fn parse_char_equal(input: &str) -> IResult<&str, Result<Self, BadSequence>> { fn parse_char_equal(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited( delimited(
tag("[="), tag("[="),
alt(( alt((
@ -338,17 +338,17 @@ impl Sequence {
} }
pub trait SymbolTranslator { pub trait SymbolTranslator {
fn translate(&mut self, current: char) -> Option<char>; fn translate(&mut self, current: u8) -> Option<u8>;
} }
#[derive(Debug)] #[derive(Debug)]
pub struct DeleteOperation { pub struct DeleteOperation {
set: Vec<char>, set: Vec<u8>,
complement_flag: bool, complement_flag: bool,
} }
impl DeleteOperation { impl DeleteOperation {
pub fn new(set: Vec<char>, complement_flag: bool) -> Self { pub fn new(set: Vec<u8>, complement_flag: bool) -> Self {
Self { Self {
set, set,
complement_flag, complement_flag,
@ -357,8 +357,8 @@ impl DeleteOperation {
} }
impl SymbolTranslator for DeleteOperation { impl SymbolTranslator for DeleteOperation {
fn translate(&mut self, current: char) -> Option<char> { fn translate(&mut self, current: u8) -> Option<u8> {
let found = self.set.iter().any(|sequence| sequence.eq(&current)); let found = self.set.iter().any(|sequence| *sequence == current);
if self.complement_flag == found { if self.complement_flag == found {
Some(current) Some(current)
} else { } else {
@ -368,15 +368,15 @@ impl SymbolTranslator for DeleteOperation {
} }
pub struct TranslateOperationComplement { pub struct TranslateOperationComplement {
iter: u32, iter: u8,
set2_iter: usize, set2_iter: usize,
set1: Vec<char>, set1: Vec<u8>,
set2: Vec<char>, set2: Vec<u8>,
translation_map: HashMap<char, char>, translation_map: HashMap<u8, u8>,
} }
impl TranslateOperationComplement { impl TranslateOperationComplement {
fn new(set1: Vec<char>, set2: Vec<char>) -> Self { fn new(set1: Vec<u8>, set2: Vec<u8>) -> Self {
Self { Self {
iter: 0, iter: 0,
set2_iter: 0, set2_iter: 0,
@ -389,11 +389,11 @@ impl TranslateOperationComplement {
#[derive(Debug)] #[derive(Debug)]
pub struct TranslateOperationStandard { pub struct TranslateOperationStandard {
translation_map: HashMap<char, char>, translation_map: HashMap<u8, u8>,
} }
impl TranslateOperationStandard { impl TranslateOperationStandard {
fn new(set1: Vec<char>, set2: Vec<char>) -> Result<Self, BadSequence> { fn new(set1: Vec<u8>, set2: Vec<u8>) -> Result<Self, BadSequence> {
if let Some(fallback) = set2.last().copied() { if let Some(fallback) = set2.last().copied() {
Ok(Self { Ok(Self {
translation_map: set1 translation_map: set1
@ -417,18 +417,17 @@ pub enum TranslateOperation {
} }
impl TranslateOperation { impl TranslateOperation {
fn next_complement_char(iter: u32, ignore_list: &[char]) -> (u32, char) { fn next_complement_char(iter: u8, ignore_list: &[u8]) -> (u8, u8) {
(iter..) (iter..)
.filter_map(std::char::from_u32) .filter(|c| !ignore_list.iter().any(|s| s == c))
.filter(|c| !ignore_list.iter().any(|s| s.eq(c))) .map(|c| (c + 1, c))
.map(|c| (u32::from(c) + 1, c))
.next() .next()
.expect("exhausted all possible characters") .expect("exhausted all possible characters")
} }
} }
impl TranslateOperation { impl TranslateOperation {
pub fn new(set1: Vec<char>, set2: Vec<char>, complement: bool) -> Result<Self, BadSequence> { pub fn new(set1: Vec<u8>, set2: Vec<u8>, complement: bool) -> Result<Self, BadSequence> {
if complement { if complement {
Ok(Self::Complement(TranslateOperationComplement::new( Ok(Self::Complement(TranslateOperationComplement::new(
set1, set2, set1, set2,
@ -440,7 +439,7 @@ impl TranslateOperation {
} }
impl SymbolTranslator for TranslateOperation { impl SymbolTranslator for TranslateOperation {
fn translate(&mut self, current: char) -> Option<char> { fn translate(&mut self, current: u8) -> Option<u8> {
match self { match self {
Self::Standard(TranslateOperationStandard { translation_map }) => Some( Self::Standard(TranslateOperationStandard { translation_map }) => Some(
translation_map translation_map
@ -482,13 +481,13 @@ impl SymbolTranslator for TranslateOperation {
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct SqueezeOperation { pub struct SqueezeOperation {
set1: HashSet<char>, set1: HashSet<u8>,
complement: bool, complement: bool,
previous: Option<char>, previous: Option<u8>,
} }
impl SqueezeOperation { impl SqueezeOperation {
pub fn new(set1: Vec<char>, complement: bool) -> Self { pub fn new(set1: Vec<u8>, complement: bool) -> Self {
Self { Self {
set1: set1.into_iter().collect(), set1: set1.into_iter().collect(),
complement, complement,
@ -498,7 +497,7 @@ impl SqueezeOperation {
} }
impl SymbolTranslator for SqueezeOperation { impl SymbolTranslator for SqueezeOperation {
fn translate(&mut self, current: char) -> Option<char> { fn translate(&mut self, current: u8) -> Option<u8> {
if self.complement { if self.complement {
let next = if self.set1.contains(&current) { let next = if self.set1.contains(&current) {
Some(current) Some(current)
@ -537,15 +536,15 @@ where
R: BufRead, R: BufRead,
W: Write, W: Write,
{ {
let mut buf = String::new(); let mut buf = Vec::new();
let mut output_buf = String::new(); let mut output_buf = Vec::new();
while let Ok(length) = input.read_line(&mut buf) { while let Ok(length) = input.read_until(b'\n', &mut buf) {
if length == 0 { if length == 0 {
break; break;
} else { } else {
let filtered = buf.chars().filter_map(|c| translator.translate(c)); let filtered = buf.iter().filter_map(|c| translator.translate(*c));
output_buf.extend(filtered); output_buf.extend(filtered);
output.write_all(output_buf.as_bytes()).unwrap(); output.write_all(&output_buf).unwrap();
} }
buf.clear(); buf.clear();
output_buf.clear(); output_buf.clear();

View file

@ -5,7 +5,6 @@
// spell-checker:ignore (ToDO) allocs bset dflag cflag sflag tflag // spell-checker:ignore (ToDO) allocs bset dflag cflag sflag tflag
mod convert;
mod operation; mod operation;
mod unicode_table; mod unicode_table;
@ -42,14 +41,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let squeeze_flag = matches.get_flag(options::SQUEEZE); let squeeze_flag = matches.get_flag(options::SQUEEZE);
let truncate_set1_flag = matches.get_flag(options::TRUNCATE_SET1); let truncate_set1_flag = matches.get_flag(options::TRUNCATE_SET1);
let sets = matches // Ultimately this should be OsString, but we might want to wait for the
// pattern API on OsStr
let sets: Vec<_> = matches
.get_many::<String>(options::SETS) .get_many::<String>(options::SETS)
.map(|v| { .into_iter()
v.map(ToString::to_string) .flatten()
.map(|input| convert::reduce_octal_to_char(&input)) .map(ToOwned::to_owned)
.collect::<Vec<_>>() .collect();
})
.unwrap_or_default();
let sets_len = sets.len(); let sets_len = sets.len();
if sets.is_empty() { if sets.is_empty() {
@ -80,8 +80,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let mut sets_iter = sets.iter().map(|c| c.as_str()); let mut sets_iter = sets.iter().map(|c| c.as_str());
let (set1, set2) = Sequence::solve_set_characters( let (set1, set2) = Sequence::solve_set_characters(
sets_iter.next().unwrap_or_default(), sets_iter.next().unwrap_or_default().as_bytes(),
sets_iter.next().unwrap_or_default(), sets_iter.next().unwrap_or_default().as_bytes(),
truncate_set1_flag, truncate_set1_flag,
)?; )?;

View file

@ -3,13 +3,13 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
pub static BEL: char = '\u{0007}'; pub static BEL: u8 = 0x7;
pub static BS: char = '\u{0008}'; pub static BS: u8 = 0x8;
pub static HT: char = '\u{0009}'; pub static HT: u8 = 0x9;
pub static LF: char = '\u{000A}'; pub static LF: u8 = 0xA;
pub static VT: char = '\u{000B}'; pub static VT: u8 = 0xB;
pub static FF: char = '\u{000C}'; pub static FF: u8 = 0xC;
pub static CR: char = '\u{000D}'; pub static CR: u8 = 0xD;
pub static SPACE: char = '\u{0020}'; pub static SPACE: u8 = 0x20;
pub static SPACES: &[char] = &[HT, LF, VT, FF, CR, SPACE]; pub static SPACES: &[u8] = &[HT, LF, VT, FF, CR, SPACE];
pub static BLANK: &[char] = &[SPACE, HT]; pub static BLANK: &[u8] = &[SPACE, HT];

View file

@ -29,12 +29,12 @@ fn test_small_set2() {
} }
#[test] #[test]
fn test_unicode() { fn test_invalid_unicode() {
new_ucmd!() new_ucmd!()
.args(&[", ┬─┬", "╯︵┻━┻"]) .args(&["-dc", "abc"])
.pipe_in("(,°□°), ┬─┬") .pipe_in([0o200, b'a', b'b', b'c'])
.run() .succeeds()
.stdout_is("(╯°□°)╯︵┻━┻"); .stdout_is("abc");
} }
#[test] #[test]
@ -733,10 +733,11 @@ fn check_against_gnu_tr_tests_w() {
// {IN=>"\300\301\377\345\345\350\345"}, // {IN=>"\300\301\377\345\345\350\345"},
// {OUT=>"\300\301\377\345"}], // {OUT=>"\300\301\377\345"}],
new_ucmd!() new_ucmd!()
.args(&["-ds", "\u{350}", "\u{345}"]) .arg("-ds")
.pipe_in("\u{300}\u{301}\u{377}\u{345}\u{345}\u{350}\u{345}") .args(&["\\350", "\\345"])
.pipe_in([0o300, 0o301, 0o377, 0o345, 0o345, 0o350, 0o345])
.succeeds() .succeeds()
.stdout_is("\u{300}\u{301}\u{377}\u{345}"); .stdout_is_bytes([0o300, 0o301, 0o377, 0o345]);
} }
#[test] #[test]