1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 19:17:43 +00:00

Merge pull request #6779 from andrewliebenow/tr-multibyte-octal-sequences

tr: correctly handle multibyte octal sequences
This commit is contained in:
Sylvestre Ledru 2024-10-30 08:35:38 +01:00 committed by GitHub
commit 566bca34ad
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 105 additions and 9 deletions

View file

@ -3,12 +3,13 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl boop // spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl
use crate::unicode_table;
use nom::{ use nom::{
branch::alt, branch::alt,
bytes::complete::{tag, take}, bytes::complete::{tag, take, take_till},
character::complete::{digit1, one_of}, character::complete::one_of,
combinator::{map, map_opt, peek, recognize, value}, combinator::{map, map_opt, peek, recognize, value},
multi::{many0, many_m_n}, multi::{many0, many_m_n},
sequence::{delimited, preceded, separated_pair}, sequence::{delimited, preceded, separated_pair},
@ -23,8 +24,6 @@ use std::{
}; };
use uucore::error::UError; use uucore::error::UError;
use crate::unicode_table;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub enum BadSequence { pub enum BadSequence {
MissingCharClassName, MissingCharClassName,
@ -37,6 +36,7 @@ pub enum BadSequence {
ClassInSet2NotMatchedBySet1, ClassInSet2NotMatchedBySet1,
Set1LongerSet2EndsInClass, Set1LongerSet2EndsInClass,
ComplementMoreThanOneUniqueInSet2, ComplementMoreThanOneUniqueInSet2,
BackwardsRange { end: u32, start: u32 },
} }
impl Display for BadSequence { impl Display for BadSequence {
@ -70,6 +70,23 @@ impl Display for BadSequence {
Self::ComplementMoreThanOneUniqueInSet2 => { Self::ComplementMoreThanOneUniqueInSet2 => {
write!(f, "when translating with complemented character classes,\nstring2 must map all characters in the domain to one") write!(f, "when translating with complemented character classes,\nstring2 must map all characters in the domain to one")
} }
Self::BackwardsRange { end, start } => {
/// Renders one range endpoint for use in the "reverse collating sequence" error message.
///
/// Values that are printable ASCII (`0x20..=0x7E`) are rendered through
/// `char::escape_default`; every other value is rendered as a backslash followed by the
/// value in octal, zero-padded to at least three digits.
fn end_or_start_to_string(ut: &u32) -> String {
    let code_point = *ut;
    let printable = char::from_u32(code_point).filter(|ch| (' '..='~').contains(ch));
    if let Some(ch) = printable {
        return ch.escape_default().to_string();
    }
    // Not printable ASCII (or not a valid `char` at all): fall back to an octal escape
    format!("\\{code_point:03o}")
}
write!(
f,
"range-endpoints of '{}-{}' are in reverse collating sequence order",
end_or_start_to_string(start),
end_or_start_to_string(end)
)
}
} }
} }
} }
@ -285,9 +302,41 @@ impl Sequence {
} }
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> { fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
preceded(
tag("\\"),
alt((
Self::parse_octal_up_to_three_digits,
// Fallback for if the three digit octal escape is greater than \377 (0xFF), and therefore can't be
// parsed as as a byte
// See test `test_multibyte_octal_sequence`
Self::parse_octal_two_digits,
)),
)(input)
}
fn parse_octal_up_to_three_digits(input: &[u8]) -> IResult<&[u8], u8> {
map_opt( map_opt(
preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))), recognize(many_m_n(1, 3, one_of("01234567"))),
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).expect("boop"), 8).ok(), |out: &[u8]| {
let str_to_parse = std::str::from_utf8(out).unwrap();
match u8::from_str_radix(str_to_parse, 8) {
Ok(ue) => Some(ue),
Err(_pa) => {
// TODO
// A warning needs to be printed here
// See https://github.com/uutils/coreutils/issues/6821
None
}
}
},
)(input)
}
fn parse_octal_two_digits(input: &[u8]) -> IResult<&[u8], u8> {
map_opt(
recognize(many_m_n(2, 2, one_of("01234567"))),
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).unwrap(), 8).ok(),
)(input) )(input)
} }
@ -324,7 +373,14 @@ impl Sequence {
.map(|(l, (a, b))| { .map(|(l, (a, b))| {
(l, { (l, {
let (start, end) = (u32::from(a), u32::from(b)); let (start, end) = (u32::from(a), u32::from(b));
Ok(Self::CharRange(start as u8, end as u8))
let range = start..=end;
if range.is_empty() {
Err(BadSequence::BackwardsRange { end, start })
} else {
Ok(Self::CharRange(start as u8, end as u8))
}
}) })
}) })
} }
@ -337,7 +393,14 @@ impl Sequence {
fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> { fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited( delimited(
tag("["), tag("["),
separated_pair(Self::parse_backslash_or_char, tag("*"), digit1), separated_pair(
Self::parse_backslash_or_char,
tag("*"),
// TODO
// Why are the opening and closing tags not sufficient?
// Backslash check is a workaround for `check_against_gnu_tr_tests_repeat_bs_9`
take_till(|ue| matches!(ue, b']' | b'\\')),
),
tag("]"), tag("]"),
)(input) )(input)
.map(|(l, (c, cnt_str))| { .map(|(l, (c, cnt_str))| {

View file

@ -1487,3 +1487,36 @@ fn test_trailing_backslash() {
.stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n") .stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n")
.stdout_is("abc"); .stdout_is("abc");
} }
#[test]
fn test_multibyte_octal_sequence() {
    // `\501` is larger than `\377`, so it cannot be a single byte. The expected output shows
    // both `(` (octal 50) and `1` being deleted, i.e. the escape is read as `\50` followed by
    // a literal `1`.
    let set_to_delete = r"\501";
    let piped_input = "(1Ł)";
    new_ucmd!()
        .args(&["-d", set_to_delete])
        .pipe_in(piped_input)
        .succeeds()
        // TODO
        // A warning needs to be printed here
        // See https://github.com/uutils/coreutils/issues/6821
        .stdout_is("Ł)");
}
#[test]
fn test_backwards_range() {
    // `8` is not an octal digit, so per the expected message the range's end is `\004`, which
    // sorts before the start `\046` (`&`): the range is backwards and must be rejected.
    let backwards_range = r"\046-\048";
    let expected_stderr = r"tr: range-endpoints of '&-\004' are in reverse collating sequence order
";
    new_ucmd!()
        .args(&["-d", backwards_range])
        .pipe_in("")
        .fails()
        .stderr_only(expected_stderr);
}
#[test]
fn test_non_digit_repeat() {
    // The repeat count in `[b*c]` is `c`, which is not a number, so `tr` must fail
    let expected_stderr = "tr: invalid repeat count 'c' in [c*n] construct\n";
    new_ucmd!()
        .args(&["a", "[b*c]"])
        .pipe_in("")
        .fails()
        .stderr_only(expected_stderr);
}