mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
printf: add error handling to escaped unicode characters
This commit is contained in:
parent
b10aa47e38
commit
ef7a8c300e
4 changed files with 128 additions and 28 deletions
|
@ -5,6 +5,8 @@
|
|||
|
||||
//! Parsing of escape sequences
|
||||
|
||||
use crate::format::FormatError;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum EscapedChar {
|
||||
/// A single byte
|
||||
|
@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
|
|||
|
||||
// spell-checker:disable-next
|
||||
/// Parse `\uHHHH` and `\UHHHHHHHH`
|
||||
// TODO: This should print warnings and possibly halt execution when it fails to parse
|
||||
// TODO: If the character cannot be converted to u32, the input should be printed.
|
||||
fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
|
||||
let (c, rest) = input.split_first()?;
|
||||
let mut ret = Base::Hex.convert_digit(*c)? as u32;
|
||||
*input = rest;
|
||||
|
||||
for _ in 1..digits {
|
||||
let (c, rest) = input.split_first()?;
|
||||
let n = Base::Hex.convert_digit(*c)?;
|
||||
ret = ret
|
||||
.wrapping_mul(Base::Hex.as_base() as u32)
|
||||
.wrapping_add(n as u32);
|
||||
fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
|
||||
if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
|
||||
*input = rest;
|
||||
let ret = new_digits
|
||||
.iter()
|
||||
.map(|c| Base::Hex.convert_digit(*c))
|
||||
.collect::<Option<Vec<u8>>>()
|
||||
.ok_or(EscapeError::MissingHexadecimalNumber)?
|
||||
.iter()
|
||||
.map(|n| *n as u32)
|
||||
.reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
|
||||
.expect("must have multiple digits in unicode string");
|
||||
char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
|
||||
} else {
|
||||
Err(EscapeError::MissingHexadecimalNumber)
|
||||
}
|
||||
|
||||
char::from_u32(ret)
|
||||
}
|
||||
|
||||
/// Represents an invalid escape sequence.
|
||||
#[derive(Debug)]
|
||||
pub struct EscapeError {}
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum EscapeError {
|
||||
InvalidCharacters(Vec<u8>),
|
||||
MissingHexadecimalNumber,
|
||||
}
|
||||
|
||||
/// Parse an escape sequence, like `\n` or `\xff`, etc.
|
||||
pub fn parse_escape_code(
|
||||
rest: &mut &[u8],
|
||||
zero_octal_parsing: OctalParsing,
|
||||
) -> Result<EscapedChar, EscapeError> {
|
||||
) -> Result<EscapedChar, FormatError> {
|
||||
if let [c, new_rest @ ..] = rest {
|
||||
// This is for the \NNN syntax for octal sequences.
|
||||
// Note that '0' is intentionally omitted because that
|
||||
|
@ -145,17 +149,89 @@ pub fn parse_escape_code(
|
|||
if let Some(c) = parse_code(rest, Base::Hex) {
|
||||
Ok(EscapedChar::Byte(c))
|
||||
} else {
|
||||
Err(EscapeError {})
|
||||
Err(FormatError::MissingHex)
|
||||
}
|
||||
}
|
||||
b'0' => Ok(EscapedChar::Byte(
|
||||
parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
|
||||
)),
|
||||
b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
|
||||
b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
|
||||
b'u' => match parse_unicode(rest, 4) {
|
||||
Ok(c) => Ok(EscapedChar::Char(c)),
|
||||
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
|
||||
Err(EscapeError::InvalidCharacters(chars)) => {
|
||||
Err(FormatError::InvalidCharacter('u', chars))
|
||||
}
|
||||
},
|
||||
b'U' => match parse_unicode(rest, 8) {
|
||||
Ok(c) => Ok(EscapedChar::Char(c)),
|
||||
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
|
||||
Err(EscapeError::InvalidCharacters(chars)) => {
|
||||
Err(FormatError::InvalidCharacter('U', chars))
|
||||
}
|
||||
},
|
||||
c => Ok(EscapedChar::Backslash(*c)),
|
||||
}
|
||||
} else {
|
||||
Ok(EscapedChar::Byte(b'\\'))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
mod parse_unicode {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn parse_ascii() {
|
||||
let input = b"2a";
|
||||
assert_eq!(parse_unicode(&mut &input[..], 2), Ok('*'));
|
||||
|
||||
let input = b"002A";
|
||||
assert_eq!(parse_unicode(&mut &input[..], 4), Ok('*'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_emoji_codepoint() {
|
||||
let input = b"0001F60A";
|
||||
assert_eq!(parse_unicode(&mut &input[..], 8), Ok('😊'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_characters() {
|
||||
let input = b"";
|
||||
assert_eq!(
|
||||
parse_unicode(&mut &input[..], 8),
|
||||
Err(EscapeError::MissingHexadecimalNumber)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn incomplete_hexadecimal_number() {
|
||||
let input = b"123";
|
||||
assert_eq!(
|
||||
parse_unicode(&mut &input[..], 4),
|
||||
Err(EscapeError::MissingHexadecimalNumber)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalid_hex() {
|
||||
let input = b"duck";
|
||||
assert_eq!(
|
||||
parse_unicode(&mut &input[..], 4),
|
||||
Err(EscapeError::MissingHexadecimalNumber)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn surrogate_code_point() {
|
||||
let input = b"d800";
|
||||
assert_eq!(
|
||||
parse_unicode(&mut &input[..], 4),
|
||||
Err(EscapeError::InvalidCharacters(Vec::from(b"d800")))
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,6 +71,9 @@ pub enum FormatError {
|
|||
EndsWithPercent(Vec<u8>),
|
||||
/// The escape sequence `\x` appears without a literal hexadecimal value.
|
||||
MissingHex,
|
||||
/// The hexadecimal characters represent a code point that cannot represent a
|
||||
/// Unicode character (e.g., a surrogate code point)
|
||||
InvalidCharacter(char, Vec<u8>),
|
||||
}
|
||||
|
||||
impl Error for FormatError {}
|
||||
|
@ -110,6 +113,12 @@ impl Display for FormatError {
|
|||
Self::NoMoreArguments => write!(f, "no more arguments"),
|
||||
Self::InvalidArgument(_) => write!(f, "invalid argument"),
|
||||
Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
|
||||
Self::InvalidCharacter(escape_char, digits) => write!(
|
||||
f,
|
||||
"invalid universal character name \\{}{}",
|
||||
escape_char,
|
||||
String::from_utf8_lossy(digits)
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
|
|||
}
|
||||
[b'\\', rest @ ..] => {
|
||||
current = rest;
|
||||
Some(
|
||||
match parse_escape_code(&mut current, OctalParsing::default()) {
|
||||
Ok(c) => Ok(FormatItem::Char(c)),
|
||||
Err(_) => Err(FormatError::MissingHex),
|
||||
},
|
||||
)
|
||||
Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
|
||||
}
|
||||
[c, rest @ ..] => {
|
||||
current = rest;
|
||||
|
|
|
@ -502,7 +502,7 @@ fn parse(
|
|||
|
||||
let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
|
||||
|
||||
// Return what has been parsed so far. It there are extra characters, mark the
|
||||
// Return what has been parsed so far. If there are extra characters, mark the
|
||||
// parsing as a partial match.
|
||||
if let Some((first_unparsed, _)) = chars.next() {
|
||||
Err(ExtendedParserError::PartialMatch(
|
||||
|
|
|
@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
|
|||
.stdout_is_bytes([1u8, b'_']);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn escaped_unicode_incomplete() {
|
||||
for arg in ["\\u", "\\U", "\\uabc", "\\Uabcd"] {
|
||||
new_ucmd!()
|
||||
.arg(arg)
|
||||
.fails_with_code(1)
|
||||
.stderr_only("printf: missing hexadecimal number in escape\n");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn escaped_unicode_invalid() {
|
||||
for arg in ["\\ud9d0", "\\U0000D8F9"] {
|
||||
new_ucmd!().arg(arg).fails_with_code(1).stderr_only(format!(
|
||||
"printf: invalid universal character name {}\n",
|
||||
arg
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn escaped_percent_sign() {
|
||||
new_ucmd!()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue