1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

echo: handle multibyte escape sequences (#6803)

* echo: handle multibyte escape sequences

Bug was reported, with root cause analysis, by kkew3
Added tests were derived from test cases provided by kkew3
See https://github.com/uutils/coreutils/issues/6741

* Use concrete type

* Fix MSRV issue

* Fix non-UTF-8 argument handling

* Fix MSRV issue

* Fix Clippy violation

* Fix compiler warning

* Address PR comments

* Add MSRV TODO comments

* echo: use stdout_only_bytes instead of stdout_is_bytes

---------

Co-authored-by: Daniel Hofstetter <daniel.hofstetter@42dh.com>
This commit is contained in:
Andrew Liebenow 2024-10-22 04:03:08 -05:00 committed by GitHub
parent 99fa11ac5c
commit 66f11c4ce4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 375 additions and 80 deletions

View file

@ -3,12 +3,15 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
use clap::builder::ValueParser;
use clap::parser::ValuesRef;
use clap::{crate_version, Arg, ArgAction, Command}; use clap::{crate_version, Arg, ArgAction, Command};
use std::io::{self, Write}; use std::ffi::{OsStr, OsString};
use std::io::{self, StdoutLock, Write};
use std::iter::Peekable; use std::iter::Peekable;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::str::Chars; use std::slice::Iter;
use uucore::error::{FromIo, UResult}; use uucore::error::{UResult, USimpleError};
use uucore::{format_usage, help_about, help_section, help_usage}; use uucore::{format_usage, help_about, help_section, help_usage};
const ABOUT: &str = help_about!("echo.md"); const ABOUT: &str = help_about!("echo.md");
@ -22,94 +25,249 @@ mod options {
pub const DISABLE_BACKSLASH_ESCAPE: &str = "disable_backslash_escape"; pub const DISABLE_BACKSLASH_ESCAPE: &str = "disable_backslash_escape";
} }
#[repr(u8)] enum BackslashNumberType {
#[derive(Clone, Copy)] OctalStartingWithNonZero(u8),
OctalStartingWithZero,
Hexadecimal,
}
impl BackslashNumberType {
    /// Numeric base in which the digits of this kind of escape sequence are written.
    ///
    /// Both octal flavours (with or without the leading zero) share the octal base;
    /// only the `\x` form is hexadecimal.
    fn base(&self) -> Base {
        match self {
            Self::Hexadecimal => Base::Hexadecimal,
            Self::OctalStartingWithNonZero(_) | Self::OctalStartingWithZero => Base::Octal,
        }
    }
}
enum Base { enum Base {
Oct = 8, Octal,
Hex = 16, Hexadecimal,
} }
impl Base { impl Base {
fn max_digits(&self) -> u8 { fn ascii_to_number(&self, digit: u8) -> Option<u8> {
/// Convert one ASCII octal digit (`b'0'` through `b'7'`) to its numeric value.
///
/// Any other byte yields `None`.
fn octal_ascii_digit_to_number(digit: u8) -> Option<u8> {
    match digit {
        // The ASCII digits are contiguous, so the value is the offset from b'0'
        b'0'..=b'7' => Some(digit - b'0'),
        _ => None,
    }
}
/// Convert one ASCII hexadecimal digit to its numeric value (0 through 15).
///
/// Both upper-case and lower-case letters are accepted; any other byte yields `None`.
fn hexadecimal_ascii_digit_to_number(digit: u8) -> Option<u8> {
    match digit {
        // Each accepted range is contiguous in ASCII, so the value is an offset
        // from the first byte of the range (plus 10 for the letters)
        b'0'..=b'9' => Some(digit - b'0'),
        b'a'..=b'f' => Some(digit - b'a' + 10),
        b'A'..=b'F' => Some(digit - b'A' + 10),
        _ => None,
    }
}
match self { match self {
Self::Oct => 3, Self::Octal => octal_ascii_digit_to_number(digit),
Self::Hex => 2, Self::Hexadecimal => hexadecimal_ascii_digit_to_number(digit),
}
}
/// Upper bound on how many digits are consumed for one escape sequence in this base:
/// three for octal, two for hexadecimal.
fn maximum_number_of_digits(&self) -> u8 {
    match self {
        Self::Hexadecimal => 2,
        Self::Octal => 3,
    }
}
fn radix(&self) -> u8 {
match self {
Self::Octal => 8,
Self::Hexadecimal => 16,
} }
} }
} }
/// Parse the numeric part of the `\xHHH` and `\0NNN` escape sequences /// Parse the numeric part of `\xHHH`, `\0NNN`, and `\NNN` escape sequences
fn parse_code(input: &mut Peekable<Chars>, base: Base) -> Option<char> { fn parse_backslash_number(
// All arithmetic on `ret` needs to be wrapping, because octal input can input: &mut Peekable<Iter<u8>>,
// take 3 digits, which is 9 bits, and therefore more than what fits in a backslash_number_type: BackslashNumberType,
// `u8`. GNU just seems to wrap these values. ) -> Option<u8> {
// Note that if we instead make `ret` a `u32` and use `char::from_u32` will let first_digit_ascii = match backslash_number_type {
// yield incorrect results because it will interpret values larger than BackslashNumberType::OctalStartingWithZero | BackslashNumberType::Hexadecimal => {
// `u8::MAX` as unicode. match input.peek() {
let mut ret = input.peek().and_then(|c| c.to_digit(base as u32))? as u8; Some(&&digit_ascii) => digit_ascii,
None => {
// We can safely ignore the None case because we just peeked it. // One of the following cases: argument ends with "\0" or "\x"
let _ = input.next(); // If "\0" (octal): caller will print not ASCII '0', 0x30, but ASCII '\0' (NUL), 0x00
// If "\x" (hexadecimal): caller will print literal "\x"
for _ in 1..base.max_digits() { return None;
match input.peek().and_then(|c| c.to_digit(base as u32)) { }
Some(n) => ret = ret.wrapping_mul(base as u8).wrapping_add(n as u8), }
None => break, }
// Never returns early when backslash number starts with "\1" through "\7", because caller provides the
// first digit
BackslashNumberType::OctalStartingWithNonZero(digit_ascii) => digit_ascii,
};
let base = backslash_number_type.base();
let first_digit_number = match base.ascii_to_number(first_digit_ascii) {
Some(digit_number) => {
// Move past byte, since it was successfully parsed
let _ = input.next();
digit_number
}
None => {
// The first digit was not a valid octal or hexadecimal digit
// This should never be the case when the backslash number starts with "\1" through "\7"
// (caller unwraps to verify this)
return None;
}
};
let radix = base.radix();
let mut sum = first_digit_number;
for _ in 1..(base.maximum_number_of_digits()) {
match input
.peek()
.and_then(|&&digit_ascii| base.ascii_to_number(digit_ascii))
{
Some(digit_number) => {
// Move past byte, since it was successfully parsed
let _ = input.next();
// All arithmetic on `sum` needs to be wrapping, because octal input can
// take 3 digits, which is 9 bits, and therefore more than what fits in a
// `u8`.
//
// GNU Core Utilities: "if nnn is a nine-bit value, the ninth bit is ignored"
// https://www.gnu.org/software/coreutils/manual/html_node/echo-invocation.html
sum = sum.wrapping_mul(radix).wrapping_add(digit_number);
}
None => {
break;
}
} }
// We can safely ignore the None case because we just peeked it.
let _ = input.next();
} }
Some(ret.into()) Some(sum)
} }
fn print_escaped(input: &str, mut output: impl Write) -> io::Result<ControlFlow<()>> { fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result<ControlFlow<()>> {
let mut iter = input.chars().peekable(); let mut iter = input.iter().peekable();
while let Some(c) = iter.next() {
if c != '\\' { while let Some(&current_byte) = iter.next() {
write!(output, "{c}")?; if current_byte != b'\\' {
output.write_all(&[current_byte])?;
continue; continue;
} }
// This is for the \NNN syntax for octal sequences. // This is for the \NNN syntax for octal sequences
// Note that '0' is intentionally omitted because that // Note that '0' is intentionally omitted, because the \0NNN syntax is handled below
// would be the \0NNN syntax. if let Some(&&first_digit @ b'1'..=b'7') = iter.peek() {
if let Some('1'..='8') = iter.peek() { // Unwrap because anything starting with "\1" through "\7" can be successfully parsed
if let Some(parsed) = parse_code(&mut iter, Base::Oct) { let parsed_octal_number = parse_backslash_number(
write!(output, "{parsed}")?; &mut iter,
continue; BackslashNumberType::OctalStartingWithNonZero(first_digit),
} )
.unwrap();
output.write_all(&[parsed_octal_number])?;
continue;
} }
if let Some(next) = iter.next() { if let Some(next) = iter.next() {
let unescaped = match next { // For extending lifetime
'\\' => '\\', // Unnecessary when using Rust >= 1.79.0
'a' => '\x07', // https://github.com/rust-lang/rust/pull/121346
'b' => '\x08', // TODO: when we have a MSRV >= 1.79.0, delete these "hold" bindings
'c' => return Ok(ControlFlow::Break(())), let hold_one_byte_outside_of_match: [u8; 1_usize];
'e' => '\x1b', let hold_two_bytes_outside_of_match: [u8; 2_usize];
'f' => '\x0c',
'n' => '\n', let unescaped: &[u8] = match *next {
'r' => '\r', b'\\' => br"\",
't' => '\t', b'a' => b"\x07",
'v' => '\x0b', b'b' => b"\x08",
'x' => { b'c' => return Ok(ControlFlow::Break(())),
if let Some(c) = parse_code(&mut iter, Base::Hex) { b'e' => b"\x1B",
c b'f' => b"\x0C",
b'n' => b"\n",
b'r' => b"\r",
b't' => b"\t",
b'v' => b"\x0B",
b'x' => {
if let Some(parsed_hexadecimal_number) =
parse_backslash_number(&mut iter, BackslashNumberType::Hexadecimal)
{
// TODO: remove when we have a MSRV >= 1.79.0
hold_one_byte_outside_of_match = [parsed_hexadecimal_number];
// TODO: when we have a MSRV >= 1.79.0, return reference to a temporary array:
// &[parsed_hexadecimal_number]
&hold_one_byte_outside_of_match
} else { } else {
write!(output, "\\")?; // "\x" with any non-hexadecimal digit after means "\x" is treated literally
'x' br"\x"
} }
} }
'0' => parse_code(&mut iter, Base::Oct).unwrap_or('\0'), b'0' => {
c => { if let Some(parsed_octal_number) = parse_backslash_number(
write!(output, "\\")?; &mut iter,
c BackslashNumberType::OctalStartingWithZero,
) {
// TODO: remove when we have a MSRV >= 1.79.0
hold_one_byte_outside_of_match = [parsed_octal_number];
// TODO: when we have a MSRV >= 1.79.0, return reference to a temporary array:
// &[parsed_octal_number]
&hold_one_byte_outside_of_match
} else {
// "\0" with any non-octal digit after it means "\0" is treated as ASCII '\0' (NUL), 0x00
b"\0"
}
}
other_byte => {
// Backslash and the following byte are treated literally
hold_two_bytes_outside_of_match = [b'\\', other_byte];
&hold_two_bytes_outside_of_match
} }
}; };
write!(output, "{unescaped}")?;
output.write_all(unescaped)?;
} else { } else {
write!(output, "\\")?; output.write_all(br"\")?;
} }
} }
@ -120,15 +278,33 @@ fn print_escaped(input: &str, mut output: impl Write) -> io::Result<ControlFlow<
pub fn uumain(args: impl uucore::Args) -> UResult<()> { pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let matches = uu_app().get_matches_from(args); let matches = uu_app().get_matches_from(args);
// TODO
// "If the POSIXLY_CORRECT environment variable is set, then when echo's first argument is not -n it outputs option-like arguments instead of treating them as options."
// https://www.gnu.org/software/coreutils/manual/html_node/echo-invocation.html
let no_newline = matches.get_flag(options::NO_NEWLINE); let no_newline = matches.get_flag(options::NO_NEWLINE);
let escaped = matches.get_flag(options::ENABLE_BACKSLASH_ESCAPE); let escaped = matches.get_flag(options::ENABLE_BACKSLASH_ESCAPE);
let values: Vec<String> = match matches.get_many::<String>(options::STRING) {
Some(s) => s.map(|s| s.to_string()).collect(),
None => vec![String::new()],
};
execute(no_newline, escaped, &values) let mut stdout_lock = io::stdout().lock();
.map_err_context(|| "could not write to stdout".to_string())
match matches.get_many::<OsString>(options::STRING) {
Some(arguments_after_options) => {
execute(
&mut stdout_lock,
no_newline,
escaped,
arguments_after_options,
)?;
}
None => {
// No strings to print, so just handle newline setting
if !no_newline {
stdout_lock.write_all(b"\n")?;
}
}
}
Ok(())
} }
pub fn uu_app() -> Command { pub fn uu_app() -> Command {
@ -165,29 +341,63 @@ pub fn uu_app() -> Command {
.action(ArgAction::SetTrue) .action(ArgAction::SetTrue)
.overrides_with(options::ENABLE_BACKSLASH_ESCAPE), .overrides_with(options::ENABLE_BACKSLASH_ESCAPE),
) )
.arg(Arg::new(options::STRING).action(ArgAction::Append)) .arg(
Arg::new(options::STRING)
.action(ArgAction::Append)
.value_parser(ValueParser::os_string()),
)
} }
fn execute(no_newline: bool, escaped: bool, free: &[String]) -> io::Result<()> { fn execute(
let stdout = io::stdout(); stdout_lock: &mut StdoutLock,
let mut output = stdout.lock(); no_newline: bool,
escaped: bool,
arguments_after_options: ValuesRef<'_, OsString>,
) -> UResult<()> {
for (i, input) in arguments_after_options.enumerate() {
let Some(bytes) = bytes_from_os_string(input.as_os_str()) else {
return Err(USimpleError::new(
1,
"Non-UTF-8 arguments provided, but this platform does not support them",
));
};
for (i, input) in free.iter().enumerate() {
if i > 0 { if i > 0 {
write!(output, " ")?; stdout_lock.write_all(b" ")?;
} }
if escaped { if escaped {
if print_escaped(input, &mut output)?.is_break() { if print_escaped(bytes, stdout_lock)?.is_break() {
return Ok(()); return Ok(());
} }
} else { } else {
write!(output, "{input}")?; stdout_lock.write_all(bytes)?;
} }
} }
if !no_newline { if !no_newline {
writeln!(output)?; stdout_lock.write_all(b"\n")?;
} }
Ok(()) Ok(())
} }
/// Borrow the raw bytes of an argument so it can be written to stdout unchanged.
///
/// On Unix-family targets every `OsStr` is viewed as bytes directly, so this always
/// returns `Some`. On other targets the bytes are only available when the argument is
/// valid UTF-8; otherwise `None` is returned.
fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> {
    #[cfg(target_family = "unix")]
    let bytes = {
        use std::os::unix::ffi::OsStrExt;

        Some(input.as_bytes())
    };

    // TODO
    // Verify that this works correctly on these platforms
    #[cfg(not(target_family = "unix"))]
    let bytes = input.to_str().map(str::as_bytes);

    bytes
}

View file

@ -303,3 +303,88 @@ fn partial_version_argument() {
fn partial_help_argument() { fn partial_help_argument() {
new_ucmd!().arg("--he").succeeds().stdout_is("--he\n"); new_ucmd!().arg("--he").succeeds().stdout_is("--he\n");
} }
#[test]
fn multibyte_escape_unicode() {
    // spell-checker:disable-next-line
    // Cases contributed by kkew3, see
    // https://github.com/uutils/coreutils/issues/6741
    //
    // The byte sequence F0 9F 98 82 is the UTF-8 encoding of U+1F602,
    // "Face with Tears of Joy" ("😂")

    // Escapes whose combined output is valid UTF-8
    let text_cases = [
        (r"\xf0\x9f\x98\x82", "\u{1F602}\n"),
        (r"\x41\xf0\x9f\x98\x82\x42", "A\u{1F602}B\n"),
    ];
    for (escapes, expected) in text_cases {
        new_ucmd!()
            .args(&["-e", escapes])
            .succeeds()
            .stdout_only(expected);
    }

    // Escapes whose output is not valid UTF-8: the multibyte sequence is either
    // interrupted by another byte or cut short by `\c`
    let byte_cases: [(&str, &[u8]); 2] = [
        (r"\xf0\x41\x9f\x98\x82", b"\xF0A\x9F\x98\x82\n"),
        (r"\x41\xf0\c\x9f\x98\x82", b"A\xF0"),
    ];
    for (escapes, expected) in byte_cases {
        new_ucmd!()
            .args(&["-e", escapes])
            .succeeds()
            .stdout_only_bytes(expected);
    }
}
#[test]
fn non_utf_8_hex_round_trip() {
    // 0xFF never occurs in valid UTF-8, so the output has to be compared as raw bytes
    let expected_stdout = b"\xFF\n";

    new_ucmd!()
        .args(&["-e", r"\xFF"])
        .succeeds()
        .stdout_only_bytes(expected_stdout);
}
#[test]
fn nine_bit_octal() {
    // Octal 777 needs nine bits; the expected output keeps only the low eight (0xFF),
    // whether or not the escape is written with a leading zero
    let expected_stdout: &[u8] = b"\xFF\n";

    for escape in [r"\0777", r"\777"] {
        new_ucmd!()
            .args(&["-e", escape])
            .succeeds()
            .stdout_only_bytes(expected_stdout);
    }
}
#[test]
#[cfg(target_family = "unix")]
fn non_utf_8() {
    use std::ffi::OsStr;
    use std::os::unix::ffi::OsStrExt;

    // Text encoded as ISO-8859-1, which is not valid UTF-8; it is expected to come
    // back out byte-for-byte
    // spell-checker:disable
    const LATIN_1_TEXT: &[u8] =
        b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre.";
    // spell-checker:enable

    new_ucmd!()
        .arg("-n")
        .arg(OsStr::from_bytes(LATIN_1_TEXT))
        .succeeds()
        .stdout_only_bytes(LATIN_1_TEXT);
}
#[test]
fn slash_eight_off_by_one() {
    // '8' is not an octal digit, so the backslash and the digit are expected to be
    // printed exactly as given
    let literal = r"\8";

    new_ucmd!()
        .args(&["-e", "-n", literal])
        .succeeds()
        .stdout_only(literal);
}