1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

printf: add error handling to escaped unicode characters

This commit is contained in:
Joseph Jon Booker 2025-04-05 21:36:40 -05:00
parent b10aa47e38
commit ef7a8c300e
4 changed files with 128 additions and 28 deletions

View file

@ -5,6 +5,8 @@
//! Parsing of escape sequences //! Parsing of escape sequences
use crate::format::FormatError;
#[derive(Debug)] #[derive(Debug)]
pub enum EscapedChar { pub enum EscapedChar {
/// A single byte /// A single byte
@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
// spell-checker:disable-next // spell-checker:disable-next
/// Parse `\uHHHH` and `\UHHHHHHHH` /// Parse `\uHHHH` and `\UHHHHHHHH`
// TODO: This should print warnings and possibly halt execution when it fails to parse fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
// TODO: If the character cannot be converted to u32, the input should be printed. if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
let (c, rest) = input.split_first()?;
let mut ret = Base::Hex.convert_digit(*c)? as u32;
*input = rest;
for _ in 1..digits {
let (c, rest) = input.split_first()?;
let n = Base::Hex.convert_digit(*c)?;
ret = ret
.wrapping_mul(Base::Hex.as_base() as u32)
.wrapping_add(n as u32);
*input = rest; *input = rest;
let ret = new_digits
.iter()
.map(|c| Base::Hex.convert_digit(*c))
.collect::<Option<Vec<u8>>>()
.ok_or(EscapeError::MissingHexadecimalNumber)?
.iter()
.map(|n| *n as u32)
.reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
.expect("must have multiple digits in unicode string");
char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
} else {
Err(EscapeError::MissingHexadecimalNumber)
} }
char::from_u32(ret)
} }
/// Represents an invalid escape sequence. /// Represents an invalid escape sequence.
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub struct EscapeError {} pub enum EscapeError {
InvalidCharacters(Vec<u8>),
MissingHexadecimalNumber,
}
/// Parse an escape sequence, like `\n` or `\xff`, etc. /// Parse an escape sequence, like `\n` or `\xff`, etc.
pub fn parse_escape_code( pub fn parse_escape_code(
rest: &mut &[u8], rest: &mut &[u8],
zero_octal_parsing: OctalParsing, zero_octal_parsing: OctalParsing,
) -> Result<EscapedChar, EscapeError> { ) -> Result<EscapedChar, FormatError> {
if let [c, new_rest @ ..] = rest { if let [c, new_rest @ ..] = rest {
// This is for the \NNN syntax for octal sequences. // This is for the \NNN syntax for octal sequences.
// Note that '0' is intentionally omitted because that // Note that '0' is intentionally omitted because that
@ -145,17 +149,89 @@ pub fn parse_escape_code(
if let Some(c) = parse_code(rest, Base::Hex) { if let Some(c) = parse_code(rest, Base::Hex) {
Ok(EscapedChar::Byte(c)) Ok(EscapedChar::Byte(c))
} else { } else {
Err(EscapeError {}) Err(FormatError::MissingHex)
} }
} }
b'0' => Ok(EscapedChar::Byte( b'0' => Ok(EscapedChar::Byte(
parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'), parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
)), )),
b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))), b'u' => match parse_unicode(rest, 4) {
b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))), Ok(c) => Ok(EscapedChar::Char(c)),
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
Err(EscapeError::InvalidCharacters(chars)) => {
Err(FormatError::InvalidCharacter('u', chars))
}
},
b'U' => match parse_unicode(rest, 8) {
Ok(c) => Ok(EscapedChar::Char(c)),
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
Err(EscapeError::InvalidCharacters(chars)) => {
Err(FormatError::InvalidCharacter('U', chars))
}
},
c => Ok(EscapedChar::Backslash(*c)), c => Ok(EscapedChar::Backslash(*c)),
} }
} else { } else {
Ok(EscapedChar::Byte(b'\\')) Ok(EscapedChar::Byte(b'\\'))
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Unit tests for the `parse_unicode` helper.
    mod parse_unicode {
        use super::*;

        /// Runs `parse_unicode` over `bytes`, asking it to read `digits` hex digits.
        fn parse(bytes: &[u8], digits: u8) -> Result<char, EscapeError> {
            let mut cursor = bytes;
            parse_unicode(&mut cursor, digits)
        }

        /// Lower- and upper-case hex digits, with and without leading zeros, decode to `*`.
        #[test]
        fn parse_ascii() {
            for (bytes, digits) in [(&b"2a"[..], 2), (&b"002A"[..], 4)] {
                assert_eq!(parse(bytes, digits), Ok('*'));
            }
        }

        /// An eight-digit sequence can address a code point outside the BMP.
        #[test]
        fn parse_emoji_codepoint() {
            // U+1F60A is the "😊" emoji.
            assert_eq!(parse(b"0001F60A", 8), Ok('\u{1F60A}'));
        }

        /// Empty input cannot supply any of the requested digits.
        #[test]
        fn no_characters() {
            let expected = Err(EscapeError::MissingHexadecimalNumber);
            assert_eq!(parse(b"", 8), expected);
        }

        /// Three digits are available but four were requested.
        #[test]
        fn incomplete_hexadecimal_number() {
            let expected = Err(EscapeError::MissingHexadecimalNumber);
            assert_eq!(parse(b"123", 4), expected);
        }

        /// Enough bytes are present, but not all of them are hex digits
        /// (`d` and `c` are, `u` and `k` are not).
        #[test]
        fn invalid_hex() {
            let expected = Err(EscapeError::MissingHexadecimalNumber);
            assert_eq!(parse(b"duck", 4), expected);
        }

        /// Well-formed hex that names a surrogate code point is rejected, and the
        /// offending digits are carried in the error.
        #[test]
        fn surrogate_code_point() {
            let expected = Err(EscapeError::InvalidCharacters(b"d800".to_vec()));
            assert_eq!(parse(b"d800", 4), expected);
        }
    }
}

View file

@ -71,6 +71,9 @@ pub enum FormatError {
EndsWithPercent(Vec<u8>), EndsWithPercent(Vec<u8>),
/// The escape sequence `\x` appears without a literal hexadecimal value. /// The escape sequence `\x` appears without a literal hexadecimal value.
MissingHex, MissingHex,
/// The hexadecimal digits of a `\u` or `\U` escape encode a code point that is
/// not a valid Unicode character (e.g., a surrogate code point).
InvalidCharacter(char, Vec<u8>),
} }
impl Error for FormatError {} impl Error for FormatError {}
@ -110,6 +113,12 @@ impl Display for FormatError {
Self::NoMoreArguments => write!(f, "no more arguments"), Self::NoMoreArguments => write!(f, "no more arguments"),
Self::InvalidArgument(_) => write!(f, "invalid argument"), Self::InvalidArgument(_) => write!(f, "invalid argument"),
Self::MissingHex => write!(f, "missing hexadecimal number in escape"), Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
Self::InvalidCharacter(escape_char, digits) => write!(
f,
"invalid universal character name \\{}{}",
escape_char,
String::from_utf8_lossy(digits)
),
} }
} }
} }
@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
} }
[b'\\', rest @ ..] => { [b'\\', rest @ ..] => {
current = rest; current = rest;
Some( Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
match parse_escape_code(&mut current, OctalParsing::default()) {
Ok(c) => Ok(FormatItem::Char(c)),
Err(_) => Err(FormatError::MissingHex),
},
)
} }
[c, rest @ ..] => { [c, rest @ ..] => {
current = rest; current = rest;

View file

@ -502,7 +502,7 @@ fn parse(
let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent); let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
// Return what has been parsed so far. It there are extra characters, mark the // Return what has been parsed so far. If there are extra characters, mark the
// parsing as a partial match. // parsing as a partial match.
if let Some((first_unparsed, _)) = chars.next() { if let Some((first_unparsed, _)) = chars.next() {
Err(ExtendedParserError::PartialMatch( Err(ExtendedParserError::PartialMatch(

View file

@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
.stdout_is_bytes([1u8, b'_']); .stdout_is_bytes([1u8, b'_']);
} }
/// `\u` and `\U` escapes that are followed by fewer hex digits than they
/// require (four and eight respectively) must make `printf` fail with the
/// "missing hexadecimal number" diagnostic.
#[test]
fn escaped_unicode_incomplete() {
    let truncated_escapes = [r"\u", r"\U", r"\uabc", r"\Uabcd"];
    for escape in truncated_escapes {
        new_ucmd!()
            .arg(escape)
            .fails_with_code(1)
            .stderr_only("printf: missing hexadecimal number in escape\n");
    }
}
/// Escapes with the right number of hex digits that name a surrogate code
/// point must make `printf` fail, echoing the offending escape in the message.
#[test]
fn escaped_unicode_invalid() {
    let surrogate_escapes = [r"\ud9d0", r"\U0000D8F9"];
    for escape in surrogate_escapes {
        let expected_stderr = format!("printf: invalid universal character name {}\n", escape);
        new_ucmd!()
            .arg(escape)
            .fails_with_code(1)
            .stderr_only(expected_stderr);
    }
}
#[test] #[test]
fn escaped_percent_sign() { fn escaped_percent_sign() {
new_ucmd!() new_ucmd!()