1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 19:17:43 +00:00

printf: add error handling to escaped unicode characters

This commit is contained in:
Joseph Jon Booker 2025-04-05 21:36:40 -05:00
parent b10aa47e38
commit ef7a8c300e
4 changed files with 128 additions and 28 deletions

View file

@ -5,6 +5,8 @@
//! Parsing of escape sequences
use crate::format::FormatError;
#[derive(Debug)]
pub enum EscapedChar {
/// A single byte
@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
// spell-checker:disable-next
/// Parse `\uHHHH` and `\UHHHHHHHH`
// TODO: This should print warnings and possibly halt execution when it fails to parse
// TODO: If the character cannot be converted to u32, the input should be printed.
fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
let (c, rest) = input.split_first()?;
let mut ret = Base::Hex.convert_digit(*c)? as u32;
*input = rest;
for _ in 1..digits {
let (c, rest) = input.split_first()?;
let n = Base::Hex.convert_digit(*c)?;
ret = ret
.wrapping_mul(Base::Hex.as_base() as u32)
.wrapping_add(n as u32);
/// Reads exactly `digits` hexadecimal digits from the front of `input` and
/// converts the resulting number to a `char`.
///
/// Whenever at least `digits` bytes are available, `input` is advanced past
/// them, even if those bytes then turn out not to be a valid escape.
///
/// # Errors
///
/// * [`EscapeError::MissingHexadecimalNumber`] if `digits` is zero, if fewer
///   than `digits` bytes remain, or if any of those bytes is not a
///   hexadecimal digit.
/// * [`EscapeError::InvalidCharacters`], carrying the raw digit bytes, if the
///   digits are well-formed but `char::from_u32` rejects the value (for
///   example a surrogate code point).
fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
    let Some((hex_digits, rest)) = input.split_at_checked(usize::from(digits)) else {
        return Err(EscapeError::MissingHexadecimalNumber);
    };
    *input = rest;

    // An empty digit run cannot name a character; report it instead of panicking.
    if hex_digits.is_empty() {
        return Err(EscapeError::MissingHexadecimalNumber);
    }

    let radix = Base::Hex.as_base() as u32;
    // Accumulate in a single pass without an intermediate allocation. Wrapping
    // arithmetic keeps over-long digit runs from overflowing.
    let code_point = hex_digits
        .iter()
        .try_fold(0u32, |acc, &byte| {
            let digit = Base::Hex.convert_digit(byte)?;
            Some(acc.wrapping_mul(radix).wrapping_add(u32::from(digit)))
        })
        .ok_or(EscapeError::MissingHexadecimalNumber)?;

    char::from_u32(code_point)
        .ok_or_else(|| EscapeError::InvalidCharacters(hex_digits.to_vec()))
}
/// Represents an invalid escape sequence.
#[derive(Debug)]
pub struct EscapeError {}
#[derive(Debug, PartialEq)]
/// Ways in which parsing the hexadecimal digits of a `\u`/`\U` escape can fail.
pub enum EscapeError {
/// The digits were valid hexadecimal, but the value could not be converted
/// to a `char`; carries the raw digit bytes that were read.
InvalidCharacters(Vec<u8>),
/// Not enough bytes were available, or one of them was not a hexadecimal digit.
MissingHexadecimalNumber,
}
/// Parse an escape sequence, like `\n` or `\xff`, etc.
pub fn parse_escape_code(
rest: &mut &[u8],
zero_octal_parsing: OctalParsing,
) -> Result<EscapedChar, EscapeError> {
) -> Result<EscapedChar, FormatError> {
if let [c, new_rest @ ..] = rest {
// This is for the \NNN syntax for octal sequences.
// Note that '0' is intentionally omitted because that
@ -145,17 +149,89 @@ pub fn parse_escape_code(
if let Some(c) = parse_code(rest, Base::Hex) {
Ok(EscapedChar::Byte(c))
} else {
Err(EscapeError {})
Err(FormatError::MissingHex)
}
}
b'0' => Ok(EscapedChar::Byte(
parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
)),
b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
b'u' => match parse_unicode(rest, 4) {
Ok(c) => Ok(EscapedChar::Char(c)),
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
Err(EscapeError::InvalidCharacters(chars)) => {
Err(FormatError::InvalidCharacter('u', chars))
}
},
b'U' => match parse_unicode(rest, 8) {
Ok(c) => Ok(EscapedChar::Char(c)),
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
Err(EscapeError::InvalidCharacters(chars)) => {
Err(FormatError::InvalidCharacter('U', chars))
}
},
c => Ok(EscapedChar::Backslash(*c)),
}
} else {
Ok(EscapedChar::Byte(b'\\'))
}
}
#[cfg(test)]
mod tests {
    use super::*;

    mod parse_unicode {
        use super::*;

        /// Feeds `bytes` to `parse_unicode`, asking for `digits` hex digits.
        fn run(bytes: &[u8], digits: u8) -> Result<char, EscapeError> {
            let mut remaining = bytes;
            parse_unicode(&mut remaining, digits)
        }

        /// Lower- and upper-case hex digits both decode to the same ASCII char.
        #[test]
        fn parse_ascii() {
            assert_eq!(run(b"2a", 2), Ok('*'));
            assert_eq!(run(b"002A", 4), Ok('*'));
        }

        /// An eight-digit escape can reach code points outside the BMP.
        #[test]
        fn parse_emoji_codepoint() {
            assert_eq!(run(b"0001F60A", 8), Ok('😊'));
        }

        /// Empty input has no digits to read.
        #[test]
        fn no_characters() {
            let outcome = run(b"", 8);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        /// Three digits are not enough for a four-digit escape.
        #[test]
        fn incomplete_hexadecimal_number() {
            let outcome = run(b"123", 4);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        /// Bytes that are not hex digits are reported as a missing number.
        #[test]
        fn invalid_hex() {
            let outcome = run(b"duck", 4);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        /// A surrogate is valid hex but not a valid `char`.
        #[test]
        fn surrogate_code_point() {
            let outcome = run(b"d800", 4);
            let expected = Err(EscapeError::InvalidCharacters(Vec::from(b"d800")));
            assert_eq!(outcome, expected);
        }
    }
}

View file

@ -71,6 +71,9 @@ pub enum FormatError {
EndsWithPercent(Vec<u8>),
/// An escape sequence (`\x`, `\u`, or `\U`) appears without the hexadecimal digits it requires.
MissingHex,
/// The hexadecimal digits of a `\u` or `\U` escape encode a value that is not
/// a valid Unicode character (e.g., a surrogate code point).
InvalidCharacter(char, Vec<u8>),
}
impl Error for FormatError {}
@ -110,6 +113,12 @@ impl Display for FormatError {
Self::NoMoreArguments => write!(f, "no more arguments"),
Self::InvalidArgument(_) => write!(f, "invalid argument"),
Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
Self::InvalidCharacter(escape_char, digits) => write!(
f,
"invalid universal character name \\{}{}",
escape_char,
String::from_utf8_lossy(digits)
),
}
}
}
@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
}
[b'\\', rest @ ..] => {
current = rest;
Some(
match parse_escape_code(&mut current, OctalParsing::default()) {
Ok(c) => Ok(FormatItem::Char(c)),
Err(_) => Err(FormatError::MissingHex),
},
)
Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
}
[c, rest @ ..] => {
current = rest;

View file

@ -502,7 +502,7 @@ fn parse(
let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
// Return what has been parsed so far. It there are extra characters, mark the
// Return what has been parsed so far. If there are extra characters, mark the
// parsing as a partial match.
if let Some((first_unparsed, _)) = chars.next() {
Err(ExtendedParserError::PartialMatch(

View file

@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
.stdout_is_bytes([1u8, b'_']);
}
#[test]
fn escaped_unicode_incomplete() {
    // Every escape here stops before supplying all of its hex digits.
    let expected_stderr = "printf: missing hexadecimal number in escape\n";
    let truncated_escapes = ["\\u", "\\U", "\\uabc", "\\Uabcd"];
    for escape in truncated_escapes {
        new_ucmd!()
            .arg(escape)
            .fails_with_code(1)
            .stderr_only(expected_stderr);
    }
}
#[test]
fn escaped_unicode_invalid() {
    // Both escapes are complete but name surrogate code points.
    for escape in ["\\ud9d0", "\\U0000D8F9"] {
        let expected_stderr = format!(
            "printf: invalid universal character name {}\n",
            escape
        );
        new_ucmd!()
            .arg(escape)
            .fails_with_code(1)
            .stderr_only(expected_stderr);
    }
}
#[test]
fn escaped_percent_sign() {
new_ucmd!()