1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 19:47:45 +00:00

Merge pull request #5640 from tertsdiepraam/tr-no-output

`tr`: operate on bytes instead of chars
This commit is contained in:
Sylvestre Ledru 2023-12-19 11:42:44 +01:00 committed by GitHub
commit 9920f13a34
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 115 additions and 151 deletions

View file

@ -1,36 +0,0 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (strings) anychar combinator
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{anychar, one_of},
combinator::{map_opt, recognize},
multi::{many0, many_m_n},
sequence::preceded,
IResult,
};
/// Parses a backslash followed by one to three octal digits and yields the
/// `char` whose code point is that octal value.
///
/// The parser fails when the digits do not convert to a valid `char`.
fn parse_octal(input: &str) -> IResult<&str, char> {
    // Converts the recognized digit run into a character, if possible.
    let to_char = |digits: &str| {
        u32::from_str_radix(digits, 8)
            .ok()
            .and_then(std::char::from_u32)
    };
    map_opt(
        preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))),
        to_char,
    )(input)
}
/// Rewrites `input` so that every backslash-octal escape accepted by
/// `parse_octal` is replaced by the character it encodes; all other
/// characters are copied through unchanged.
pub fn reduce_octal_to_char(input: &str) -> String {
    let (_rest, chars) = many0(alt((parse_octal, anychar)))(input).unwrap();
    chars.into_iter().collect()
}

View file

@ -7,10 +7,10 @@
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{anychar, digit1},
combinator::{map, peek, value},
multi::many0,
bytes::complete::{tag, take},
character::complete::{digit1, one_of},
combinator::{map, map_opt, peek, recognize, value},
multi::{many0, many_m_n},
sequence::{delimited, preceded, separated_pair},
IResult,
};
@ -62,10 +62,10 @@ impl UError for BadSequence {}
#[derive(Debug, Clone, Copy)]
pub enum Sequence {
Char(char),
CharRange(u32, u32),
CharStar(char),
CharRepeat(char, usize),
Char(u8),
CharRange(u8, u8),
CharStar(u8),
CharRepeat(u8, usize),
Alnum,
Alpha,
Blank,
@ -81,21 +81,17 @@ pub enum Sequence {
}
impl Sequence {
pub fn flatten(&self) -> Box<dyn Iterator<Item = char>> {
pub fn flatten(&self) -> Box<dyn Iterator<Item = u8>> {
match self {
Self::Char(c) => Box::new(std::iter::once(*c)),
Self::CharRange(l, r) => Box::new((*l..=*r).flat_map(std::char::from_u32)),
Self::CharRange(l, r) => Box::new(*l..=*r),
Self::CharStar(c) => Box::new(std::iter::repeat(*c)),
Self::CharRepeat(c, n) => Box::new(std::iter::repeat(*c).take(*n)),
Self::Alnum => Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z')),
Self::Alpha => Box::new(('A'..='Z').chain('a'..='z')),
Self::Alnum => Box::new((b'0'..=b'9').chain(b'A'..=b'Z').chain(b'a'..=b'z')),
Self::Alpha => Box::new((b'A'..=b'Z').chain(b'a'..=b'z')),
Self::Blank => Box::new(unicode_table::BLANK.iter().cloned()),
Self::Control => Box::new(
(0..=31)
.chain(std::iter::once(127))
.flat_map(std::char::from_u32),
),
Self::Digit => Box::new('0'..='9'),
Self::Control => Box::new((0..=31).chain(std::iter::once(127))),
Self::Digit => Box::new(b'0'..=b'9'),
Self::Graph => Box::new(
(48..=57) // digit
.chain(65..=90) // uppercase
@ -105,10 +101,9 @@ impl Sequence {
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.chain(std::iter::once(32)) // space
.flat_map(std::char::from_u32),
.chain(std::iter::once(32)), // space
),
Self::Lower => Box::new('a'..='z'),
Self::Lower => Box::new(b'a'..=b'z'),
Self::Print => Box::new(
(48..=57) // digit
.chain(65..=90) // uppercase
@ -117,29 +112,22 @@ impl Sequence {
.chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(std::char::from_u32),
),
Self::Punct => Box::new(
(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(std::char::from_u32),
.chain(123..=126),
),
Self::Punct => Box::new((33..=47).chain(58..=64).chain(91..=96).chain(123..=126)),
Self::Space => Box::new(unicode_table::SPACES.iter().cloned()),
Self::Upper => Box::new('A'..='Z'),
Self::Xdigit => Box::new(('0'..='9').chain('A'..='F').chain('a'..='f')),
Self::Upper => Box::new(b'A'..=b'Z'),
Self::Xdigit => Box::new((b'0'..=b'9').chain(b'A'..=b'F').chain(b'a'..=b'f')),
}
}
// Keeps the messy details of resolving both sets contained in one place.
// TODO: Make the 2 set lazily generate the character mapping as necessary.
pub fn solve_set_characters(
set1_str: &str,
set2_str: &str,
set1_str: &[u8],
set2_str: &[u8],
truncate_set1_flag: bool,
) -> Result<(Vec<char>, Vec<char>), BadSequence> {
) -> Result<(Vec<u8>, Vec<u8>), BadSequence> {
let set1 = Self::from_str(set1_str)?;
let set2 = Self::from_str(set2_str)?;
@ -164,7 +152,7 @@ impl Sequence {
.count();
let star_compensate_len = set1_len.saturating_sub(set2_len);
let (left, right) = (partition.next(), partition.next());
let set2_solved: Vec<char> = match (left, right) {
let set2_solved: Vec<_> = match (left, right) {
(None, None) => match char_star {
Some(c) => std::iter::repeat(*c).take(star_compensate_len).collect(),
None => std::iter::empty().collect(),
@ -201,7 +189,7 @@ impl Sequence {
.collect(),
},
};
let mut set1_solved: Vec<char> = set1.iter().flat_map(Self::flatten).collect();
let mut set1_solved: Vec<_> = set1.iter().flat_map(Self::flatten).collect();
if truncate_set1_flag {
set1_solved.truncate(set2_solved.len());
}
@ -216,7 +204,7 @@ impl Sequence {
}
impl Sequence {
pub fn from_str(input: &str) -> Result<Vec<Self>, BadSequence> {
pub fn from_str(input: &[u8]) -> Result<Vec<Self>, BadSequence> {
many0(alt((
Self::parse_char_range,
Self::parse_char_star,
@ -232,27 +220,38 @@ impl Sequence {
.collect::<Result<Vec<_>, _>>()
}
fn parse_backslash(input: &str) -> IResult<&str, char> {
preceded(tag("\\"), anychar)(input).map(|(l, a)| {
/// Parses a backslash followed by one to three octal digits into the byte
/// that the digits denote.
///
/// The parser fails (through `map_opt`) when the value does not fit in a
/// `u8`, i.e. for anything above `\377`.
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
    map_opt(
        preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))),
        |digits: &[u8]| {
            // `one_of` only lets ASCII '0'..='7' through, so each byte can be
            // turned into its digit value directly. Checked arithmetic rejects
            // values that overflow a byte instead of wrapping, and no
            // intermediate UTF-8 conversion (or panic path) is needed.
            digits.iter().try_fold(0u8, |acc, digit| {
                acc.checked_mul(8)?.checked_add(*digit - b'0')
            })
        },
    )(input)
}
fn parse_backslash(input: &[u8]) -> IResult<&[u8], u8> {
preceded(tag("\\"), Self::single_char)(input).map(|(l, a)| {
let c = match a {
'a' => unicode_table::BEL,
'b' => unicode_table::BS,
'f' => unicode_table::FF,
'n' => unicode_table::LF,
'r' => unicode_table::CR,
't' => unicode_table::HT,
'v' => unicode_table::VT,
b'a' => unicode_table::BEL,
b'b' => unicode_table::BS,
b'f' => unicode_table::FF,
b'n' => unicode_table::LF,
b'r' => unicode_table::CR,
b't' => unicode_table::HT,
b'v' => unicode_table::VT,
x => x,
};
(l, c)
})
}
fn parse_backslash_or_char(input: &str) -> IResult<&str, char> {
alt((Self::parse_backslash, anychar))(input)
fn parse_backslash_or_char(input: &[u8]) -> IResult<&[u8], u8> {
alt((Self::parse_octal, Self::parse_backslash, Self::single_char))(input)
}
fn parse_char_range(input: &str) -> IResult<&str, Result<Self, BadSequence>> {
/// Consumes exactly one byte from `input` and returns it together with the
/// remaining input; fails when `take(1)` cannot be satisfied.
fn single_char(input: &[u8]) -> IResult<&[u8], u8> {
    match take(1usize)(input) {
        // `take(1)` succeeded, so the matched slice holds exactly one byte.
        Ok((rest, taken)) => Ok((rest, taken[0])),
        Err(err) => Err(err),
    }
}
fn parse_char_range(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
separated_pair(
Self::parse_backslash_or_char,
tag("-"),
@ -261,41 +260,42 @@ impl Sequence {
.map(|(l, (a, b))| {
(l, {
let (start, end) = (u32::from(a), u32::from(b));
Ok(Self::CharRange(start, end))
Ok(Self::CharRange(start as u8, end as u8))
})
})
}
fn parse_char_star(input: &str) -> IResult<&str, Result<Self, BadSequence>> {
fn parse_char_star(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(tag("["), Self::parse_backslash_or_char, tag("*]"))(input)
.map(|(l, a)| (l, Ok(Self::CharStar(a))))
}
fn parse_char_repeat(input: &str) -> IResult<&str, Result<Self, BadSequence>> {
fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(
tag("["),
separated_pair(Self::parse_backslash_or_char, tag("*"), digit1),
tag("]"),
)(input)
.map(|(l, (c, cnt_str))| {
let result = if cnt_str.starts_with('0') {
match usize::from_str_radix(cnt_str, 8) {
let s = String::from_utf8_lossy(cnt_str);
let result = if cnt_str.starts_with(b"0") {
match usize::from_str_radix(&s, 8) {
Ok(0) => Ok(Self::CharStar(c)),
Ok(count) => Ok(Self::CharRepeat(c, count)),
Err(_) => Err(BadSequence::InvalidRepeatCount(cnt_str.to_string())),
Err(_) => Err(BadSequence::InvalidRepeatCount(s.to_string())),
}
} else {
match cnt_str.parse::<usize>() {
match s.parse::<usize>() {
Ok(0) => Ok(Self::CharStar(c)),
Ok(count) => Ok(Self::CharRepeat(c, count)),
Err(_) => Err(BadSequence::InvalidRepeatCount(cnt_str.to_string())),
Err(_) => Err(BadSequence::InvalidRepeatCount(s.to_string())),
}
};
(l, result)
})
}
fn parse_class(input: &str) -> IResult<&str, Result<Self, BadSequence>> {
fn parse_class(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(
tag("[:"),
alt((
@ -322,7 +322,7 @@ impl Sequence {
)(input)
}
fn parse_char_equal(input: &str) -> IResult<&str, Result<Self, BadSequence>> {
fn parse_char_equal(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(
tag("[="),
alt((
@ -338,17 +338,17 @@ impl Sequence {
}
pub trait SymbolTranslator {
fn translate(&mut self, current: char) -> Option<char>;
fn translate(&mut self, current: u8) -> Option<u8>;
}
#[derive(Debug)]
pub struct DeleteOperation {
set: Vec<char>,
set: Vec<u8>,
complement_flag: bool,
}
impl DeleteOperation {
pub fn new(set: Vec<char>, complement_flag: bool) -> Self {
pub fn new(set: Vec<u8>, complement_flag: bool) -> Self {
Self {
set,
complement_flag,
@ -357,8 +357,8 @@ impl DeleteOperation {
}
impl SymbolTranslator for DeleteOperation {
fn translate(&mut self, current: char) -> Option<char> {
let found = self.set.iter().any(|sequence| sequence.eq(&current));
fn translate(&mut self, current: u8) -> Option<u8> {
let found = self.set.iter().any(|sequence| *sequence == current);
if self.complement_flag == found {
Some(current)
} else {
@ -368,15 +368,15 @@ impl SymbolTranslator for DeleteOperation {
}
pub struct TranslateOperationComplement {
iter: u32,
iter: u8,
set2_iter: usize,
set1: Vec<char>,
set2: Vec<char>,
translation_map: HashMap<char, char>,
set1: Vec<u8>,
set2: Vec<u8>,
translation_map: HashMap<u8, u8>,
}
impl TranslateOperationComplement {
fn new(set1: Vec<char>, set2: Vec<char>) -> Self {
fn new(set1: Vec<u8>, set2: Vec<u8>) -> Self {
Self {
iter: 0,
set2_iter: 0,
@ -389,11 +389,11 @@ impl TranslateOperationComplement {
#[derive(Debug)]
pub struct TranslateOperationStandard {
translation_map: HashMap<char, char>,
translation_map: HashMap<u8, u8>,
}
impl TranslateOperationStandard {
fn new(set1: Vec<char>, set2: Vec<char>) -> Result<Self, BadSequence> {
fn new(set1: Vec<u8>, set2: Vec<u8>) -> Result<Self, BadSequence> {
if let Some(fallback) = set2.last().copied() {
Ok(Self {
translation_map: set1
@ -417,18 +417,17 @@ pub enum TranslateOperation {
}
impl TranslateOperation {
fn next_complement_char(iter: u32, ignore_list: &[char]) -> (u32, char) {
fn next_complement_char(iter: u8, ignore_list: &[u8]) -> (u8, u8) {
(iter..)
.filter_map(std::char::from_u32)
.filter(|c| !ignore_list.iter().any(|s| s.eq(c)))
.map(|c| (u32::from(c) + 1, c))
.filter(|c| !ignore_list.iter().any(|s| s == c))
.map(|c| (c + 1, c))
.next()
.expect("exhausted all possible characters")
}
}
impl TranslateOperation {
pub fn new(set1: Vec<char>, set2: Vec<char>, complement: bool) -> Result<Self, BadSequence> {
pub fn new(set1: Vec<u8>, set2: Vec<u8>, complement: bool) -> Result<Self, BadSequence> {
if complement {
Ok(Self::Complement(TranslateOperationComplement::new(
set1, set2,
@ -440,7 +439,7 @@ impl TranslateOperation {
}
impl SymbolTranslator for TranslateOperation {
fn translate(&mut self, current: char) -> Option<char> {
fn translate(&mut self, current: u8) -> Option<u8> {
match self {
Self::Standard(TranslateOperationStandard { translation_map }) => Some(
translation_map
@ -482,13 +481,13 @@ impl SymbolTranslator for TranslateOperation {
#[derive(Debug, Clone)]
pub struct SqueezeOperation {
set1: HashSet<char>,
set1: HashSet<u8>,
complement: bool,
previous: Option<char>,
previous: Option<u8>,
}
impl SqueezeOperation {
pub fn new(set1: Vec<char>, complement: bool) -> Self {
pub fn new(set1: Vec<u8>, complement: bool) -> Self {
Self {
set1: set1.into_iter().collect(),
complement,
@ -498,7 +497,7 @@ impl SqueezeOperation {
}
impl SymbolTranslator for SqueezeOperation {
fn translate(&mut self, current: char) -> Option<char> {
fn translate(&mut self, current: u8) -> Option<u8> {
if self.complement {
let next = if self.set1.contains(&current) {
Some(current)
@ -537,15 +536,15 @@ where
R: BufRead,
W: Write,
{
let mut buf = String::new();
let mut output_buf = String::new();
while let Ok(length) = input.read_line(&mut buf) {
let mut buf = Vec::new();
let mut output_buf = Vec::new();
while let Ok(length) = input.read_until(b'\n', &mut buf) {
if length == 0 {
break;
} else {
let filtered = buf.chars().filter_map(|c| translator.translate(c));
let filtered = buf.iter().filter_map(|c| translator.translate(*c));
output_buf.extend(filtered);
output.write_all(output_buf.as_bytes()).unwrap();
output.write_all(&output_buf).unwrap();
}
buf.clear();
output_buf.clear();

View file

@ -5,7 +5,6 @@
// spell-checker:ignore (ToDO) allocs bset dflag cflag sflag tflag
mod convert;
mod operation;
mod unicode_table;
@ -40,14 +39,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let squeeze_flag = matches.get_flag(options::SQUEEZE);
let truncate_set1_flag = matches.get_flag(options::TRUNCATE_SET1);
let sets = matches
// Ultimately this should be OsString, but we might want to wait for the
// pattern API on OsStr
let sets: Vec<_> = matches
.get_many::<String>(options::SETS)
.map(|v| {
v.map(ToString::to_string)
.map(|input| convert::reduce_octal_to_char(&input))
.collect::<Vec<_>>()
})
.unwrap_or_default();
.into_iter()
.flatten()
.map(ToOwned::to_owned)
.collect();
let sets_len = sets.len();
if sets.is_empty() {
@ -78,8 +78,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let mut sets_iter = sets.iter().map(|c| c.as_str());
let (set1, set2) = Sequence::solve_set_characters(
sets_iter.next().unwrap_or_default(),
sets_iter.next().unwrap_or_default(),
sets_iter.next().unwrap_or_default().as_bytes(),
sets_iter.next().unwrap_or_default().as_bytes(),
truncate_set1_flag,
)?;

View file

@ -3,13 +3,13 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
/// Bell, U+0007.
pub static BEL: char = '\x07';
/// Backspace, U+0008.
pub static BS: char = '\x08';
/// Horizontal tab, U+0009.
pub static HT: char = '\t';
/// Line feed, U+000A.
pub static LF: char = '\n';
/// Vertical tab, U+000B.
pub static VT: char = '\x0B';
/// Form feed, U+000C.
pub static FF: char = '\x0C';
/// Carriage return, U+000D.
pub static CR: char = '\r';
/// Space, U+0020.
pub static SPACE: char = ' ';
/// The six whitespace characters: tab, line feed, vertical tab, form feed,
/// carriage return and space.
pub static SPACES: &[char] = &[HT, LF, VT, FF, CR, SPACE];
/// The two blank characters: space and horizontal tab.
pub static BLANK: &[char] = &[SPACE, HT];
/// Bell, byte 0x07.
pub static BEL: u8 = b'\x07';
/// Backspace, byte 0x08.
pub static BS: u8 = b'\x08';
/// Horizontal tab, byte 0x09.
pub static HT: u8 = b'\t';
/// Line feed, byte 0x0A.
pub static LF: u8 = b'\n';
/// Vertical tab, byte 0x0B.
pub static VT: u8 = b'\x0B';
/// Form feed, byte 0x0C.
pub static FF: u8 = b'\x0C';
/// Carriage return, byte 0x0D.
pub static CR: u8 = b'\r';
/// Space, byte 0x20.
pub static SPACE: u8 = b' ';
/// The six whitespace bytes: tab, line feed, vertical tab, form feed,
/// carriage return and space.
pub static SPACES: &[u8] = &[HT, LF, VT, FF, CR, SPACE];
/// The two blank bytes: space and horizontal tab.
pub static BLANK: &[u8] = &[SPACE, HT];

View file

@ -29,12 +29,12 @@ fn test_small_set2() {
}
#[test]
fn test_unicode() {
fn test_invalid_unicode() {
new_ucmd!()
.args(&[", ┬─┬", "╯︵┻━┻"])
.pipe_in("(,°□°), ┬─┬")
.run()
.stdout_is("(╯°□°)╯︵┻━┻");
.args(&["-dc", "abc"])
.pipe_in([0o200, b'a', b'b', b'c'])
.succeeds()
.stdout_is("abc");
}
#[test]
@ -733,10 +733,11 @@ fn check_against_gnu_tr_tests_w() {
// {IN=>"\300\301\377\345\345\350\345"},
// {OUT=>"\300\301\377\345"}],
new_ucmd!()
.args(&["-ds", "\u{350}", "\u{345}"])
.pipe_in("\u{300}\u{301}\u{377}\u{345}\u{345}\u{350}\u{345}")
.arg("-ds")
.args(&["\\350", "\\345"])
.pipe_in([0o300, 0o301, 0o377, 0o345, 0o345, 0o350, 0o345])
.succeeds()
.stdout_is("\u{300}\u{301}\u{377}\u{345}");
.stdout_is_bytes([0o300, 0o301, 0o377, 0o345]);
}
#[test]