From 355103134b76923053b2fa3c5ccda53c0e1108b9 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Thu, 21 Nov 2024 22:35:42 -0500 Subject: [PATCH] quoting_style: add support for non-unicode bytes This new functionality is implemented, but not yet exposed here. --- src/uucore/src/lib/features/quoting_style.rs | 512 +++++++++++++++---- 1 file changed, 406 insertions(+), 106 deletions(-) diff --git a/src/uucore/src/lib/features/quoting_style.rs b/src/uucore/src/lib/features/quoting_style.rs index 1efa6f746..0544633bb 100644 --- a/src/uucore/src/lib/features/quoting_style.rs +++ b/src/uucore/src/lib/features/quoting_style.rs @@ -11,34 +11,38 @@ use std::fmt; // These are characters with special meaning in the shell (e.g. bash). // The first const contains characters that only have a special meaning when they appear at the beginning of a name. -const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#']; +const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#"; // PR#6559 : Remove `]{}` from special shell chars. const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! "; /// The quoting style to use when escaping a name. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum QuotingStyle { - /// Escape the name as a literal string. + /// Escape the name as a shell string. + /// Used in, e.g., `ls --quoting-style=shell`. Shell { /// Whether to escape characters in the name. + /// True in, e.g., `ls --quoting-style=shell-escape`. escape: bool, /// Whether to always quote the name. always_quote: bool, - /// Whether to show control characters. + /// Whether to show control and non-unicode characters, or replace them with `?`. show_control: bool, }, /// Escape the name as a C string. + /// Used in, e.g., `ls --quote-name`. C { /// The type of quotes to use. quotes: Quotes, }, - /// Escape the name as a literal string. + /// Do not escape the string. + /// Used in, e.g., `ls --literal`. Literal { - /// Whether to show control characters. + /// Whether to show control and non-unicode characters, or replace them with `?`. show_control: bool, }, } @@ -72,8 +76,9 @@ enum EscapeState { Octal(EscapeOctal), } +/// Byte we need to present as escaped octal, in the form of `\nnn` struct EscapeOctal { - c: char, + c: u8, state: EscapeOctalState, idx: usize, } @@ -95,20 +100,20 @@ impl Iterator for EscapeOctal { Some('\\') } EscapeOctalState::Value => { - let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7; + let octal_digit = ((self.c) >> (self.idx * 3)) & 0o7; if self.idx == 0 { self.state = EscapeOctalState::Done; } else { self.idx -= 1; } - Some(from_digit(octal_digit, 8).unwrap()) + Some(from_digit(octal_digit.into(), 8).unwrap()) } } } } impl EscapeOctal { - fn from(c: char) -> Self { + fn from(c: u8) -> Self { Self { c, idx: 2, @@ -124,6 +129,12 @@ impl EscapedChar { } } + fn new_octal(b: u8) -> Self { + Self { + state: EscapeState::Octal(EscapeOctal::from(b)), + } + } + fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self { use EscapeState::*; let init_state = match c { @@ -148,7 +159,7 @@ impl EscapedChar { _ => Char(' '), }, ':' if dirname => Backslash(':'), - _ if c.is_ascii_control() => Octal(EscapeOctal::from(c)), + _ if c.is_ascii_control() => Octal(EscapeOctal::from(c as u8)), _ => Char(c), }; Self { state: init_state } @@ -165,7 +176,7 @@ impl EscapedChar { '\x0B' => Backslash('v'), '\x0C' => Backslash('f'), '\r' => Backslash('r'), - '\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)), + '\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c as u8)), '\'' => match quotes { Quotes::Single => Backslash('\''), _ => Char('\''), @@ -205,102 +216,124 @@ impl Iterator for EscapedChar { } } -fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) { +/// Check whether `bytes` starts with any byte in `pattern`. +fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool { + !bytes.is_empty() && pattern.contains(&bytes[0]) +} + +fn shell_without_escape(name: &[u8], quotes: Quotes, show_control_chars: bool) -> (Vec, bool) { let mut must_quote = false; - let mut escaped_str = String::with_capacity(name.len()); + let mut escaped_str = Vec::with_capacity(name.len()); + let mut utf8_buf = vec![0; 4]; - for c in name.chars() { - let escaped = { - let ec = EscapedChar::new_shell(c, false, quotes); - if show_control_chars { - ec - } else { - ec.hide_control() - } - }; + for s in name.utf8_chunks() { + for c in s.valid().chars() { + let escaped = { + let ec = EscapedChar::new_shell(c, false, quotes); + if show_control_chars { + ec + } else { + ec.hide_control() + } + }; - match escaped.state { - EscapeState::Backslash('\'') => escaped_str.push_str("'\\''"), - EscapeState::ForceQuote(x) => { - must_quote = true; - escaped_str.push(x); - } - _ => { - for char in escaped { - escaped_str.push(char); + match escaped.state { + EscapeState::Backslash('\'') => escaped_str.extend_from_slice(b"'\\''"), + EscapeState::ForceQuote(x) => { + must_quote = true; + escaped_str.extend_from_slice(x.encode_utf8(&mut utf8_buf).as_bytes()); + } + _ => { + for c in escaped { + escaped_str.extend_from_slice(c.encode_utf8(&mut utf8_buf).as_bytes()); + } } } } + + if show_control_chars { + escaped_str.extend_from_slice(s.invalid()); + } else { + escaped_str.resize(escaped_str.len() + s.invalid().len(), b'?'); + } } - must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); + must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START); (escaped_str, must_quote) } -fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) { +fn shell_with_escape(name: &[u8], quotes: Quotes) -> (Vec, bool) { // We need to keep track of whether we are in a dollar expression // because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n' let mut in_dollar = false; let mut must_quote = false; let mut escaped_str = String::with_capacity(name.len()); - for c in name.chars() { - let escaped = EscapedChar::new_shell(c, true, quotes); - match escaped.state { - EscapeState::Char(x) => { - if in_dollar { - escaped_str.push_str("''"); + for s in name.utf8_chunks() { + for c in s.valid().chars() { + let escaped = EscapedChar::new_shell(c, true, quotes); + match escaped.state { + EscapeState::Char(x) => { + if in_dollar { + escaped_str.push_str("''"); + in_dollar = false; + } + escaped_str.push(x); + } + EscapeState::ForceQuote(x) => { + if in_dollar { + escaped_str.push_str("''"); + in_dollar = false; + } + must_quote = true; + escaped_str.push(x); + } + // Single quotes are not put in dollar expressions, but are escaped + // if the string also contains double quotes. In that case, they must + // be handled separately. + EscapeState::Backslash('\'') => { + must_quote = true; in_dollar = false; + escaped_str.push_str("'\\''"); } - escaped_str.push(x); - } - EscapeState::ForceQuote(x) => { - if in_dollar { - escaped_str.push_str("''"); - in_dollar = false; - } - must_quote = true; - escaped_str.push(x); - } - // Single quotes are not put in dollar expressions, but are escaped - // if the string also contains double quotes. In that case, they must - // be handled separately. - EscapeState::Backslash('\'') => { - must_quote = true; - in_dollar = false; - escaped_str.push_str("'\\''"); - } - _ => { - if !in_dollar { - escaped_str.push_str("'$'"); - in_dollar = true; - } - must_quote = true; - for char in escaped { - escaped_str.push(char); + _ => { + if !in_dollar { + escaped_str.push_str("'$'"); + in_dollar = true; + } + must_quote = true; + for char in escaped { + escaped_str.push(char); + } } } } + if !s.invalid().is_empty() { + if !in_dollar { + escaped_str.push_str("'$'"); + in_dollar = true; + } + must_quote = true; + let escaped_bytes: String = s + .invalid() + .iter() + .flat_map(|b| EscapedChar::new_octal(*b)) + .collect(); + escaped_str.push_str(&escaped_bytes); + } } - must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); - (escaped_str, must_quote) + must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START); + (escaped_str.into(), must_quote) } /// Return a set of characters that implies quoting of the word in /// shell-quoting mode. -fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] { - const ESCAPED_CHARS: &[char] = &[ - // the ':' colon character only induce quoting in the - // context of ls displaying a directory name before listing its content. - // (e.g. with the recursive flag -R) - ':', - // Under this line are the control characters that should be - // quoted in shell mode in all cases. - '"', '`', '$', '\\', '^', '\n', '\t', '\r', '=', - ]; - +fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] { + const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r="; + // the ':' colon character only induce quoting in the + // context of ls displaying a directory name before listing its content. + // (e.g. with the recursive flag -R) let start_index = if is_dirname { 0 } else { 1 }; - &ESCAPED_CHARS[start_index..] } @@ -308,41 +341,57 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] { /// /// This inner function provides an additional flag `dirname` which /// is meant for ls' directory name display. -fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String { +fn escape_name_inner(name: &[u8], style: &QuotingStyle, dirname: bool) -> Vec { match style { QuotingStyle::Literal { show_control } => { if *show_control { - name.to_string_lossy().into_owned() + name.to_owned() } else { - name.to_string_lossy() - .chars() - .flat_map(|c| EscapedChar::new_literal(c).hide_control()) - .collect() + name.utf8_chunks() + .map(|s| { + let valid: String = s + .valid() + .chars() + .flat_map(|c| EscapedChar::new_literal(c).hide_control()) + .collect(); + let invalid = "?".repeat(s.invalid().len()); + valid + &invalid + }) + .collect::() + .into() } } QuotingStyle::C { quotes } => { let escaped_str: String = name - .to_string_lossy() - .chars() - .flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)) - .collect(); + .utf8_chunks() + .flat_map(|s| { + let valid = s + .valid() + .chars() + .flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)); + let invalid = s.invalid().iter().flat_map(|b| EscapedChar::new_octal(*b)); + valid.chain(invalid) + }) + .collect::(); match quotes { Quotes::Single => format!("'{escaped_str}'"), Quotes::Double => format!("\"{escaped_str}\""), Quotes::None => escaped_str, } + .into() } QuotingStyle::Shell { escape, always_quote, show_control, } => { - let name = name.to_string_lossy(); - - let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) { + let (quotes, must_quote) = if name + .iter() + .any(|c| shell_escaped_char_set(dirname).contains(c)) + { (Quotes::Single, true) - } else if name.contains('\'') { + } else if name.contains(&b'\'') { (Quotes::Double, true) } else if *always_quote { (Quotes::Single, true) @@ -351,15 +400,24 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin }; let (escaped_str, contains_quote_chars) = if *escape { - shell_with_escape(&name, quotes) + shell_with_escape(name, quotes) } else { - shell_without_escape(&name, quotes, *show_control) + shell_without_escape(name, quotes, *show_control) }; - match (must_quote | contains_quote_chars, quotes) { - (true, Quotes::Single) => format!("'{escaped_str}'"), - (true, Quotes::Double) => format!("\"{escaped_str}\""), - _ => escaped_str, + if must_quote | contains_quote_chars && quotes != Quotes::None { + let mut quoted_str = Vec::::with_capacity(escaped_str.len() + 2); + let quote = if quotes == Quotes::Single { + b'\'' + } else { + b'"' + }; + quoted_str.push(quote); + quoted_str.extend(escaped_str); + quoted_str.push(quote); + quoted_str + } else { + escaped_str } } } @@ -367,14 +425,16 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin /// Escape a filename with respect to the given style. pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String { - escape_name_inner(name, style, false) + let name = name.to_string_lossy(); + String::from_utf8_lossy(&escape_name_inner(name.as_bytes(), style, false)).to_string() } /// Escape a directory name with respect to the given style. /// This is mainly meant to be used for ls' directory name printing and is not /// likely to be used elsewhere. pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String { - escape_name_inner(dir_name, style, true) + let dir_name = dir_name.to_string_lossy(); + String::from_utf8_lossy(&escape_name_inner(dir_name.as_bytes(), style, true)).to_string() } impl fmt::Display for QuotingStyle { @@ -415,7 +475,7 @@ impl fmt::Display for Quotes { #[cfg(test)] mod tests { - use crate::quoting_style::{escape_name, Quotes, QuotingStyle}; + use crate::quoting_style::{escape_name_inner, Quotes, QuotingStyle}; // spell-checker:ignore (tests/words) one\'two one'two @@ -465,14 +525,31 @@ mod tests { } } + fn check_names_inner(name: &[u8], map: &[(T, &str)]) -> Vec> { + map.iter() + .map(|(_, style)| escape_name_inner(name, &get_style(style), false)) + .collect() + } + fn check_names(name: &str, map: &[(&str, &str)]) { assert_eq!( map.iter() - .map(|(_, style)| escape_name(name.as_ref(), &get_style(style))) - .collect::>(), + .map(|(correct, _)| *correct) + .collect::>(), + check_names_inner(name.as_bytes(), map) + .iter() + .map(|bytes| std::str::from_utf8(bytes) + .expect("valid str goes in, valid str comes out")) + .collect::>() + ); + } + + fn check_names_raw(name: &[u8], map: &[(&[u8], &str)]) { + assert_eq!( map.iter() - .map(|(correct, _)| correct.to_string()) - .collect::>() + .map(|(correct, _)| *correct) + .collect::>(), + check_names_inner(name, map) ); } @@ -732,6 +809,229 @@ mod tests { ); } + #[test] + fn test_non_unicode_bytes() { + let ascii = b'_'; + let continuation = b'\xA7'; + let first2byte = b'\xC2'; + let first3byte = b'\xE0'; + let first4byte = b'\xF0'; + let invalid = b'\xC0'; + + // a single byte value invalid outside of additional context in UTF-8 + check_names_raw( + &[continuation], + &[ + (b"?", "literal"), + (b"\xA7", "literal-show"), + (b"\\247", "escape"), + (b"\"\\247\"", "c"), + (b"?", "shell"), + (b"\xA7", "shell-show"), + (b"'?'", "shell-always"), + (b"'\xA7'", "shell-always-show"), + (b"''$'\\247'", "shell-escape"), + (b"''$'\\247'", "shell-escape-always"), + ], + ); + + // ...but the byte becomes valid with appropriate context + // (this is just the ยง character in UTF-8, written as bytes) + check_names_raw( + &[first2byte, continuation], + &[ + (b"\xC2\xA7", "literal"), + (b"\xC2\xA7", "literal-show"), + (b"\xC2\xA7", "escape"), + (b"\"\xC2\xA7\"", "c"), + (b"\xC2\xA7", "shell"), + (b"\xC2\xA7", "shell-show"), + (b"'\xC2\xA7'", "shell-always"), + (b"'\xC2\xA7'", "shell-always-show"), + (b"\xC2\xA7", "shell-escape"), + (b"'\xC2\xA7'", "shell-escape-always"), + ], + ); + + // mixed with valid characters + check_names_raw( + &[continuation, ascii], + &[ + (b"?_", "literal"), + (b"\xA7_", "literal-show"), + (b"\\247_", "escape"), + (b"\"\\247_\"", "c"), + (b"?_", "shell"), + (b"\xA7_", "shell-show"), + (b"'?_'", "shell-always"), + (b"'\xA7_'", "shell-always-show"), + (b"''$'\\247''_'", "shell-escape"), + (b"''$'\\247''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[ascii, continuation], + &[ + (b"_?", "literal"), + (b"_\xA7", "literal-show"), + (b"_\\247", "escape"), + (b"\"_\\247\"", "c"), + (b"_?", "shell"), + (b"_\xA7", "shell-show"), + (b"'_?'", "shell-always"), + (b"'_\xA7'", "shell-always-show"), + (b"'_'$'\\247'", "shell-escape"), + (b"'_'$'\\247'", "shell-escape-always"), + ], + ); + check_names_raw( + &[ascii, continuation, ascii], + &[ + (b"_?_", "literal"), + (b"_\xA7_", "literal-show"), + (b"_\\247_", "escape"), + (b"\"_\\247_\"", "c"), + (b"_?_", "shell"), + (b"_\xA7_", "shell-show"), + (b"'_?_'", "shell-always"), + (b"'_\xA7_'", "shell-always-show"), + (b"'_'$'\\247''_'", "shell-escape"), + (b"'_'$'\\247''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[continuation, ascii, continuation], + &[ + (b"?_?", "literal"), + (b"\xA7_\xA7", "literal-show"), + (b"\\247_\\247", "escape"), + (b"\"\\247_\\247\"", "c"), + (b"?_?", "shell"), + (b"\xA7_\xA7", "shell-show"), + (b"'?_?'", "shell-always"), + (b"'\xA7_\xA7'", "shell-always-show"), + (b"''$'\\247''_'$'\\247'", "shell-escape"), + (b"''$'\\247''_'$'\\247'", "shell-escape-always"), + ], + ); + + // contiguous invalid bytes + check_names_raw( + &[ + ascii, + invalid, + ascii, + continuation, + continuation, + ascii, + continuation, + continuation, + continuation, + ascii, + continuation, + continuation, + continuation, + continuation, + ascii, + ], + &[ + (b"_?_??_???_????_", "literal"), + ( + b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_", + "literal-show", + ), + ( + b"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_", + "escape", + ), + ( + b"\"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_\"", + "c", + ), + (b"_?_??_???_????_", "shell"), + ( + b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_", + "shell-show", + ), + (b"'_?_??_???_????_'", "shell-always"), + ( + b"'_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_'", + "shell-always-show", + ), + ( + b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'", + "shell-escape", + ), + ( + b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'", + "shell-escape-always", + ), + ], + ); + + // invalid multi-byte sequences that start valid + check_names_raw( + &[first2byte, ascii], + &[ + (b"?_", "literal"), + (b"\xC2_", "literal-show"), + (b"\\302_", "escape"), + (b"\"\\302_\"", "c"), + (b"?_", "shell"), + (b"\xC2_", "shell-show"), + (b"'?_'", "shell-always"), + (b"'\xC2_'", "shell-always-show"), + (b"''$'\\302''_'", "shell-escape"), + (b"''$'\\302''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[first2byte, first2byte, continuation], + &[ + (b"?\xC2\xA7", "literal"), + (b"\xC2\xC2\xA7", "literal-show"), + (b"\\302\xC2\xA7", "escape"), + (b"\"\\302\xC2\xA7\"", "c"), + (b"?\xC2\xA7", "shell"), + (b"\xC2\xC2\xA7", "shell-show"), + (b"'?\xC2\xA7'", "shell-always"), + (b"'\xC2\xC2\xA7'", "shell-always-show"), + (b"''$'\\302''\xC2\xA7'", "shell-escape"), + (b"''$'\\302''\xC2\xA7'", "shell-escape-always"), + ], + ); + check_names_raw( + &[first3byte, continuation, ascii], + &[ + (b"??_", "literal"), + (b"\xE0\xA7_", "literal-show"), + (b"\\340\\247_", "escape"), + (b"\"\\340\\247_\"", "c"), + (b"??_", "shell"), + (b"\xE0\xA7_", "shell-show"), + (b"'??_'", "shell-always"), + (b"'\xE0\xA7_'", "shell-always-show"), + (b"''$'\\340\\247''_'", "shell-escape"), + (b"''$'\\340\\247''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[first4byte, continuation, continuation, ascii], + &[ + (b"???_", "literal"), + (b"\xF0\xA7\xA7_", "literal-show"), + (b"\\360\\247\\247_", "escape"), + (b"\"\\360\\247\\247_\"", "c"), + (b"???_", "shell"), + (b"\xF0\xA7\xA7_", "shell-show"), + (b"'???_'", "shell-always"), + (b"'\xF0\xA7\xA7_'", "shell-always-show"), + (b"''$'\\360\\247\\247''_'", "shell-escape"), + (b"''$'\\360\\247\\247''_'", "shell-escape-always"), + ], + ); + } + #[test] fn test_question_mark() { // A question mark must force quotes in shell and shell-always, unless