mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 11:07:44 +00:00
Merge pull request #6779 from andrewliebenow/tr-multibyte-octal-sequences
tr: correctly handle multibyte octal sequences
This commit is contained in:
commit
566bca34ad
2 changed files with 105 additions and 9 deletions
|
@ -3,12 +3,13 @@
|
||||||
// For the full copyright and license information, please view the LICENSE
|
// For the full copyright and license information, please view the LICENSE
|
||||||
// file that was distributed with this source code.
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl boop
|
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl
|
||||||
|
|
||||||
|
use crate::unicode_table;
|
||||||
use nom::{
|
use nom::{
|
||||||
branch::alt,
|
branch::alt,
|
||||||
bytes::complete::{tag, take},
|
bytes::complete::{tag, take, take_till},
|
||||||
character::complete::{digit1, one_of},
|
character::complete::one_of,
|
||||||
combinator::{map, map_opt, peek, recognize, value},
|
combinator::{map, map_opt, peek, recognize, value},
|
||||||
multi::{many0, many_m_n},
|
multi::{many0, many_m_n},
|
||||||
sequence::{delimited, preceded, separated_pair},
|
sequence::{delimited, preceded, separated_pair},
|
||||||
|
@ -23,8 +24,6 @@ use std::{
|
||||||
};
|
};
|
||||||
use uucore::error::UError;
|
use uucore::error::UError;
|
||||||
|
|
||||||
use crate::unicode_table;
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub enum BadSequence {
|
pub enum BadSequence {
|
||||||
MissingCharClassName,
|
MissingCharClassName,
|
||||||
|
@ -37,6 +36,7 @@ pub enum BadSequence {
|
||||||
ClassInSet2NotMatchedBySet1,
|
ClassInSet2NotMatchedBySet1,
|
||||||
Set1LongerSet2EndsInClass,
|
Set1LongerSet2EndsInClass,
|
||||||
ComplementMoreThanOneUniqueInSet2,
|
ComplementMoreThanOneUniqueInSet2,
|
||||||
|
BackwardsRange { end: u32, start: u32 },
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Display for BadSequence {
|
impl Display for BadSequence {
|
||||||
|
@ -70,6 +70,23 @@ impl Display for BadSequence {
|
||||||
Self::ComplementMoreThanOneUniqueInSet2 => {
|
Self::ComplementMoreThanOneUniqueInSet2 => {
|
||||||
write!(f, "when translating with complemented character classes,\nstring2 must map all characters in the domain to one")
|
write!(f, "when translating with complemented character classes,\nstring2 must map all characters in the domain to one")
|
||||||
}
|
}
|
||||||
|
Self::BackwardsRange { end, start } => {
|
||||||
|
fn end_or_start_to_string(ut: &u32) -> String {
|
||||||
|
match char::from_u32(*ut) {
|
||||||
|
Some(ch @ '\x20'..='\x7E') => ch.escape_default().to_string(),
|
||||||
|
_ => {
|
||||||
|
format!("\\{ut:03o}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"range-endpoints of '{}-{}' are in reverse collating sequence order",
|
||||||
|
end_or_start_to_string(start),
|
||||||
|
end_or_start_to_string(end)
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -285,9 +302,41 @@ impl Sequence {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
|
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
|
||||||
|
preceded(
|
||||||
|
tag("\\"),
|
||||||
|
alt((
|
||||||
|
Self::parse_octal_up_to_three_digits,
|
||||||
|
// Fallback for when the three-digit octal escape is greater than \377 (0xFF), and therefore can't be
|
||||||
|
// parsed as a byte
|
||||||
|
// See test `test_multibyte_octal_sequence`
|
||||||
|
Self::parse_octal_two_digits,
|
||||||
|
)),
|
||||||
|
)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_octal_up_to_three_digits(input: &[u8]) -> IResult<&[u8], u8> {
|
||||||
map_opt(
|
map_opt(
|
||||||
preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))),
|
recognize(many_m_n(1, 3, one_of("01234567"))),
|
||||||
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).expect("boop"), 8).ok(),
|
|out: &[u8]| {
|
||||||
|
let str_to_parse = std::str::from_utf8(out).unwrap();
|
||||||
|
|
||||||
|
match u8::from_str_radix(str_to_parse, 8) {
|
||||||
|
Ok(ue) => Some(ue),
|
||||||
|
Err(_pa) => {
|
||||||
|
// TODO
|
||||||
|
// A warning needs to be printed here
|
||||||
|
// See https://github.com/uutils/coreutils/issues/6821
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_octal_two_digits(input: &[u8]) -> IResult<&[u8], u8> {
|
||||||
|
map_opt(
|
||||||
|
recognize(many_m_n(2, 2, one_of("01234567"))),
|
||||||
|
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).unwrap(), 8).ok(),
|
||||||
)(input)
|
)(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -324,7 +373,14 @@ impl Sequence {
|
||||||
.map(|(l, (a, b))| {
|
.map(|(l, (a, b))| {
|
||||||
(l, {
|
(l, {
|
||||||
let (start, end) = (u32::from(a), u32::from(b));
|
let (start, end) = (u32::from(a), u32::from(b));
|
||||||
Ok(Self::CharRange(start as u8, end as u8))
|
|
||||||
|
let range = start..=end;
|
||||||
|
|
||||||
|
if range.is_empty() {
|
||||||
|
Err(BadSequence::BackwardsRange { end, start })
|
||||||
|
} else {
|
||||||
|
Ok(Self::CharRange(start as u8, end as u8))
|
||||||
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -337,7 +393,14 @@ impl Sequence {
|
||||||
fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
|
fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
|
||||||
delimited(
|
delimited(
|
||||||
tag("["),
|
tag("["),
|
||||||
separated_pair(Self::parse_backslash_or_char, tag("*"), digit1),
|
separated_pair(
|
||||||
|
Self::parse_backslash_or_char,
|
||||||
|
tag("*"),
|
||||||
|
// TODO
|
||||||
|
// Why are the opening and closing tags not sufficient?
|
||||||
|
// Backslash check is a workaround for `check_against_gnu_tr_tests_repeat_bs_9`
|
||||||
|
take_till(|ue| matches!(ue, b']' | b'\\')),
|
||||||
|
),
|
||||||
tag("]"),
|
tag("]"),
|
||||||
)(input)
|
)(input)
|
||||||
.map(|(l, (c, cnt_str))| {
|
.map(|(l, (c, cnt_str))| {
|
||||||
|
|
|
@ -1487,3 +1487,36 @@ fn test_trailing_backslash() {
|
||||||
.stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n")
|
.stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n")
|
||||||
.stdout_is("abc");
|
.stdout_is("abc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multibyte_octal_sequence() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-d", r"\501"])
|
||||||
|
.pipe_in("(1Ł)")
|
||||||
|
.succeeds()
|
||||||
|
// TODO
|
||||||
|
// A warning needs to be printed here
|
||||||
|
// See https://github.com/uutils/coreutils/issues/6821
|
||||||
|
.stdout_is("Ł)");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_backwards_range() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-d", r"\046-\048"])
|
||||||
|
.pipe_in("")
|
||||||
|
.fails()
|
||||||
|
.stderr_only(
|
||||||
|
r"tr: range-endpoints of '&-\004' are in reverse collating sequence order
|
||||||
|
",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_non_digit_repeat() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["a", "[b*c]"])
|
||||||
|
.pipe_in("")
|
||||||
|
.fails()
|
||||||
|
.stderr_only("tr: invalid repeat count 'c' in [c*n] construct\n");
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue