From a696e609eb8014963f32e7266397dd649194b6bb Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 12 Oct 2024 00:53:54 -0500 Subject: [PATCH 1/4] tr: correctly handle multibyte octal sequences --- src/uu/tr/src/operation.rs | 52 ++++++++++++++++++++++++++++++++++---- tests/by-util/test_tr.rs | 13 ++++++++++ 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/uu/tr/src/operation.rs b/src/uu/tr/src/operation.rs index 3eedb5ba9..e86768c4f 100644 --- a/src/uu/tr/src/operation.rs +++ b/src/uu/tr/src/operation.rs @@ -3,8 +3,9 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl boop +// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl +use crate::unicode_table; use nom::{ branch::alt, bytes::complete::{tag, take}, @@ -23,8 +24,6 @@ use std::{ }; use uucore::error::UError; -use crate::unicode_table; - #[derive(Debug, Clone)] pub enum BadSequence { MissingCharClassName, @@ -285,9 +284,52 @@ impl Sequence { } fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> { + preceded( + tag("\\"), + alt(( + Self::parse_octal_up_to_three_digits, + // Fallback for if the three digit octal escape is greater than \377 (0xFF), and therefore can't be + // parsed as as a byte + // See test `test_multibyte_octal_sequence` + Self::parse_octal_two_digits, + )), + )(input) + } + + fn parse_octal_up_to_three_digits(input: &[u8]) -> IResult<&[u8], u8> { map_opt( - preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))), - |out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).expect("boop"), 8).ok(), + recognize(many_m_n(1, 3, one_of("01234567"))), + |out: &[u8]| { + let str_to_parse = std::str::from_utf8(out).unwrap(); + + match u8::from_str_radix(str_to_parse, 8) { + Ok(ue) => Some(ue), + Err(_pa) => { + // TODO + // Cannot log here, because this closure is executed multiple times + + // let mut last_char = str_to_parse.chars(); + + // let second_number = last_char.next_back().unwrap(); + + // let first_number = last_char.as_str(); + + // show!(USimpleError::new( + // 0, + // format!("warning: the ambiguous octal escape \\{str_to_parse} is being interpreted as the 2-byte sequence \\{first_number}, {second_number}") + // )); + + None + } + } + }, + )(input) + } + + fn parse_octal_two_digits(input: &[u8]) -> IResult<&[u8], u8> { + map_opt( + recognize(many_m_n(2, 2, one_of("01234567"))), + |out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).unwrap(), 8).ok(), )(input) } diff --git a/tests/by-util/test_tr.rs b/tests/by-util/test_tr.rs index b956511f1..c3b52758c 100644 --- a/tests/by-util/test_tr.rs +++ b/tests/by-util/test_tr.rs @@ -1487,3 +1487,16 @@ fn test_trailing_backslash() { .stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n") .stdout_is("abc"); } + +#[test] +fn test_multibyte_octal_sequence() { + new_ucmd!() + .args(&["-d", r"\501"]) + .pipe_in("(1Ł)") + .succeeds() + // TODO + // Cannot log warning because of how nom is parsing the arguments + // .stderr_is(r"tr: warning: the ambiguous octal escape \501 is being interpreted as the 2-byte sequence \50, 1 + // ") + .stdout_is("Ł)"); +} From 0bf5a68c54dfa7616004bf503ed6e0968099330f Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 12 Oct 2024 01:40:05 -0500 Subject: [PATCH 2/4] tr: forbid backwards ranges --- src/uu/tr/src/operation.rs | 27 ++++++++++++++++++++++++++- tests/by-util/test_tr.rs | 12 ++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/uu/tr/src/operation.rs b/src/uu/tr/src/operation.rs index e86768c4f..5c17b7ab4 100644 --- a/src/uu/tr/src/operation.rs +++ b/src/uu/tr/src/operation.rs @@ -36,6 +36,7 @@ pub enum BadSequence { ClassInSet2NotMatchedBySet1, Set1LongerSet2EndsInClass, ComplementMoreThanOneUniqueInSet2, + BackwardsRange { end: u32, start: u32 }, } impl Display for BadSequence { @@ -69,6 +70,23 @@ impl Display for BadSequence { Self::ComplementMoreThanOneUniqueInSet2 => { write!(f, "when translating with complemented character classes,\nstring2 must map all characters in the domain to one") } + Self::BackwardsRange { end, start } => { + fn end_or_start_to_string(ut: &u32) -> String { + match char::from_u32(*ut) { + Some(ch @ '\x20'..='\x7E') => ch.escape_default().to_string(), + _ => { + format!("\\{ut:03o}") + } + } + } + + write!( + f, + "range-endpoints of '{}-{}' are in reverse collating sequence order", + end_or_start_to_string(start), + end_or_start_to_string(end) + ) + } } } } @@ -366,7 +384,14 @@ impl Sequence { .map(|(l, (a, b))| { (l, { let (start, end) = (u32::from(a), u32::from(b)); - Ok(Self::CharRange(start as u8, end as u8)) + + let range = start..=end; + + if range.is_empty() { + Err(BadSequence::BackwardsRange { end, start }) + } else { + Ok(Self::CharRange(start as u8, end as u8)) + } }) }) } diff --git a/tests/by-util/test_tr.rs b/tests/by-util/test_tr.rs index c3b52758c..4cf883bac 100644 --- a/tests/by-util/test_tr.rs +++ b/tests/by-util/test_tr.rs @@ -1500,3 +1500,15 @@ fn test_multibyte_octal_sequence() { // ") .stdout_is("Ł)"); } + +#[test] +fn test_backwards_range() { + new_ucmd!() + .args(&["-d", r"\046-\048"]) + .pipe_in("") + .fails() + .stderr_only( + r"tr: range-endpoints of '&-\004' are in reverse collating sequence order +", + ); +} From e53dd5ffe141b3f77657e2267c63833ff1192dda Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 12 Oct 2024 03:44:46 -0500 Subject: [PATCH 3/4] tr: forbid non-numeric repeat counts --- src/uu/tr/src/operation.rs | 13 ++++++++++--- tests/by-util/test_tr.rs | 9 +++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/uu/tr/src/operation.rs b/src/uu/tr/src/operation.rs index 5c17b7ab4..9bb5f9983 100644 --- a/src/uu/tr/src/operation.rs +++ b/src/uu/tr/src/operation.rs @@ -8,8 +8,8 @@ use crate::unicode_table; use nom::{ branch::alt, - bytes::complete::{tag, take}, - character::complete::{digit1, one_of}, + bytes::complete::{tag, take, take_till}, + character::complete::one_of, combinator::{map, map_opt, peek, recognize, value}, multi::{many0, many_m_n}, sequence::{delimited, preceded, separated_pair}, @@ -404,7 +404,14 @@ impl Sequence { fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result> { delimited( tag("["), - separated_pair(Self::parse_backslash_or_char, tag("*"), digit1), + separated_pair( + Self::parse_backslash_or_char, + tag("*"), + // TODO + // Why are the opening and closing tags not sufficient? + // Backslash check is a workaround for `check_against_gnu_tr_tests_repeat_bs_9` + take_till(|ue| matches!(ue, b']' | b'\\')), + ), tag("]"), )(input) .map(|(l, (c, cnt_str))| { diff --git a/tests/by-util/test_tr.rs b/tests/by-util/test_tr.rs index 4cf883bac..ed529f407 100644 --- a/tests/by-util/test_tr.rs +++ b/tests/by-util/test_tr.rs @@ -1512,3 +1512,12 @@ fn test_backwards_range() { ", ); } + +#[test] +fn test_non_digit_repeat() { + new_ucmd!() + .args(&["a", "[b*c]"]) + .pipe_in("") + .fails() + .stderr_only("tr: invalid repeat count 'c' in [c*n] construct\n"); +} From 186d749e94905a3f98271513be48e6772c5112fd Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Thu, 24 Oct 2024 13:12:38 -0500 Subject: [PATCH 4/4] Replace commented-out code with TODO comments --- src/uu/tr/src/operation.rs | 15 ++------------- tests/by-util/test_tr.rs | 5 ++--- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/uu/tr/src/operation.rs b/src/uu/tr/src/operation.rs index 9bb5f9983..fc01a8360 100644 --- a/src/uu/tr/src/operation.rs +++ b/src/uu/tr/src/operation.rs @@ -324,19 +324,8 @@ impl Sequence { Ok(ue) => Some(ue), Err(_pa) => { // TODO - // Cannot log here, because this closure is executed multiple times - - // let mut last_char = str_to_parse.chars(); - - // let second_number = last_char.next_back().unwrap(); - - // let first_number = last_char.as_str(); - - // show!(USimpleError::new( - // 0, - // format!("warning: the ambiguous octal escape \\{str_to_parse} is being interpreted as the 2-byte sequence \\{first_number}, {second_number}") - // )); - + // A warning needs to be printed here + // See https://github.com/uutils/coreutils/issues/6821 None } } diff --git a/tests/by-util/test_tr.rs b/tests/by-util/test_tr.rs index ed529f407..ebd7635e4 100644 --- a/tests/by-util/test_tr.rs +++ b/tests/by-util/test_tr.rs @@ -1495,9 +1495,8 @@ fn test_multibyte_octal_sequence() { .pipe_in("(1Ł)") .succeeds() // TODO - // Cannot log warning because of how nom is parsing the arguments - // .stderr_is(r"tr: warning: the ambiguous octal escape \501 is being interpreted as the 2-byte sequence \50, 1 - // ") + // A warning needs to be printed here + // See https://github.com/uutils/coreutils/issues/6821 .stdout_is("Ł)"); }