1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

tr: correctly handle multibyte octal sequences

This commit is contained in:
Andrew Liebenow 2024-10-12 00:53:54 -05:00
parent 654581ac77
commit a696e609eb
2 changed files with 60 additions and 5 deletions

View file

@ -3,8 +3,9 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl boop
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl
use crate::unicode_table;
use nom::{
branch::alt,
bytes::complete::{tag, take},
@ -23,8 +24,6 @@ use std::{
};
use uucore::error::UError;
use crate::unicode_table;
#[derive(Debug, Clone)]
pub enum BadSequence {
MissingCharClassName,
@ -285,9 +284,52 @@ impl Sequence {
}
/// Parses a backslash-introduced octal escape into a single byte.
///
/// The backslash is consumed here; the digits that follow are handed to the
/// digit parsers, tried in order.
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
    let backslash = tag("\\");
    let octal_digits = alt((
        Self::parse_octal_up_to_three_digits,
        // Fallback for when the three digit octal escape is greater than \377 (0xFF), and
        // therefore can't be parsed as a byte
        // See test `test_multibyte_octal_sequence`
        Self::parse_octal_two_digits,
    ));

    preceded(backslash, octal_digits)(input)
}
/// Parses one to three octal digits into a single byte.
///
/// The leading backslash is not matched here; `parse_octal` consumes it before
/// delegating to this parser.
///
/// When the digits do not fit in a `u8` (anything above `377` octal, such as
/// `501`), the closure returns `None` and the parser fails, which lets the
/// `alt` in `parse_octal` fall back to `parse_octal_two_digits`.
fn parse_octal_up_to_three_digits(input: &[u8]) -> IResult<&[u8], u8> {
    map_opt(
        recognize(many_m_n(1, 3, one_of("01234567"))),
        |out: &[u8]| {
            // `out` was matched by `one_of("01234567")`, so it only holds ASCII digits.
            let str_to_parse =
                std::str::from_utf8(out).expect("octal digits are always valid UTF-8");

            // TODO
            // On overflow, a warning should be shown:
            //   "warning: the ambiguous octal escape \NNN is being interpreted as the
            //    2-byte sequence \NN, N"
            // Cannot log here, because this closure is executed multiple times
            u8::from_str_radix(str_to_parse, 8).ok()
        },
    )(input)
}
/// Parses exactly two octal digits into a single byte.
///
/// Two octal digits are at most `77` (decimal 63), so the value always fits
/// in a `u8`.
fn parse_octal_two_digits(input: &[u8]) -> IResult<&[u8], u8> {
    let two_octal_digits = recognize(many_m_n(2, 2, one_of("01234567")));

    map_opt(two_octal_digits, |digits: &[u8]| {
        let text = std::str::from_utf8(digits).unwrap();
        u8::from_str_radix(text, 8).ok()
    })(input)
}

View file

@ -1487,3 +1487,16 @@ fn test_trailing_backslash() {
.stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n")
.stdout_is("abc");
}
// An octal escape above `\377` cannot be a single byte. `\501` is expected to be
// read as the two bytes `\50` (the character `(`) and `1`, so `-d` deletes
// both `(` and `1` from the input and leaves `Ł)`.
#[test]
fn test_multibyte_octal_sequence() {
new_ucmd!()
.args(&["-d", r"\501"])
.pipe_in("(1Ł)")
.succeeds()
// TODO
// Cannot log warning because of how nom is parsing the arguments
// .stderr_is(r"tr: warning: the ambiguous octal escape \501 is being interpreted as the 2-byte sequence \50, 1
// ")
.stdout_is("Ł)");
}