mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
tr: correctly handle multibyte octal sequences
This commit is contained in:
parent
654581ac77
commit
a696e609eb
2 changed files with 60 additions and 5 deletions
|
@ -3,8 +3,9 @@
|
||||||
// For the full copyright and license information, please view the LICENSE
|
// For the full copyright and license information, please view the LICENSE
|
||||||
// file that was distributed with this source code.
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl boop
|
// spell-checker:ignore (strings) anychar combinator Alnum Punct Xdigit alnum punct xdigit cntrl
|
||||||
|
|
||||||
|
use crate::unicode_table;
|
||||||
use nom::{
|
use nom::{
|
||||||
branch::alt,
|
branch::alt,
|
||||||
bytes::complete::{tag, take},
|
bytes::complete::{tag, take},
|
||||||
|
@ -23,8 +24,6 @@ use std::{
|
||||||
};
|
};
|
||||||
use uucore::error::UError;
|
use uucore::error::UError;
|
||||||
|
|
||||||
use crate::unicode_table;
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub enum BadSequence {
|
pub enum BadSequence {
|
||||||
MissingCharClassName,
|
MissingCharClassName,
|
||||||
|
@ -285,9 +284,52 @@ impl Sequence {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
|
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
|
||||||
|
preceded(
|
||||||
|
tag("\\"),
|
||||||
|
alt((
|
||||||
|
Self::parse_octal_up_to_three_digits,
|
||||||
|
// Fallback for when the three-digit octal escape is greater than \377 (0xFF), and therefore can't be
|
||||||
|
// parsed as a byte
|
||||||
|
// See test `test_multibyte_octal_sequence`
|
||||||
|
Self::parse_octal_two_digits,
|
||||||
|
)),
|
||||||
|
)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_octal_up_to_three_digits(input: &[u8]) -> IResult<&[u8], u8> {
|
||||||
map_opt(
|
map_opt(
|
||||||
preceded(tag("\\"), recognize(many_m_n(1, 3, one_of("01234567")))),
|
recognize(many_m_n(1, 3, one_of("01234567"))),
|
||||||
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).expect("boop"), 8).ok(),
|
|out: &[u8]| {
|
||||||
|
let str_to_parse = std::str::from_utf8(out).unwrap();
|
||||||
|
|
||||||
|
match u8::from_str_radix(str_to_parse, 8) {
|
||||||
|
Ok(ue) => Some(ue),
|
||||||
|
Err(_pa) => {
|
||||||
|
// TODO
|
||||||
|
// Cannot log here, because this closure is executed multiple times
|
||||||
|
|
||||||
|
// let mut last_char = str_to_parse.chars();
|
||||||
|
|
||||||
|
// let second_number = last_char.next_back().unwrap();
|
||||||
|
|
||||||
|
// let first_number = last_char.as_str();
|
||||||
|
|
||||||
|
// show!(USimpleError::new(
|
||||||
|
// 0,
|
||||||
|
// format!("warning: the ambiguous octal escape \\{str_to_parse} is being interpreted as the 2-byte sequence \\{first_number}, {second_number}")
|
||||||
|
// ));
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_octal_two_digits(input: &[u8]) -> IResult<&[u8], u8> {
|
||||||
|
map_opt(
|
||||||
|
recognize(many_m_n(2, 2, one_of("01234567"))),
|
||||||
|
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).unwrap(), 8).ok(),
|
||||||
)(input)
|
)(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1487,3 +1487,16 @@ fn test_trailing_backslash() {
|
||||||
.stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n")
|
.stderr_is("tr: warning: an unescaped backslash at end of string is not portable\n")
|
||||||
.stdout_is("abc");
|
.stdout_is("abc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multibyte_octal_sequence() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-d", r"\501"])
|
||||||
|
.pipe_in("(1Ł)")
|
||||||
|
.succeeds()
|
||||||
|
// TODO
|
||||||
|
// Cannot log warning because of how nom is parsing the arguments
|
||||||
|
// .stderr_is(r"tr: warning: the ambiguous octal escape \501 is being interpreted as the 2-byte sequence \50, 1
|
||||||
|
// ")
|
||||||
|
.stdout_is("Ł)");
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue