1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-08-01 05:27:45 +00:00

quoting_style: add support for non-unicode bytes

This new functionality is implemented, but not yet exposed here.
This commit is contained in:
Justin Tracey 2024-11-21 22:35:42 -05:00
parent cb3be5e3aa
commit 355103134b
No known key found for this signature in database
GPG key ID: 62B84F5ABDDDCE54

View file

@ -11,34 +11,38 @@ use std::fmt;
// These are characters with special meaning in the shell (e.g. bash). // These are characters with special meaning in the shell (e.g. bash).
// The first const contains characters that only have a special meaning when they appear at the beginning of a name. // The first const contains characters that only have a special meaning when they appear at the beginning of a name.
const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#']; const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#";
// PR#6559 : Remove `]{}` from special shell chars. // PR#6559 : Remove `]{}` from special shell chars.
const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! "; const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! ";
/// The quoting style to use when escaping a name. /// The quoting style to use when escaping a name.
#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum QuotingStyle { pub enum QuotingStyle {
/// Escape the name as a literal string. /// Escape the name as a shell string.
/// Used in, e.g., `ls --quoting-style=shell`.
Shell { Shell {
/// Whether to escape characters in the name. /// Whether to escape characters in the name.
/// True in, e.g., `ls --quoting-style=shell-escape`.
escape: bool, escape: bool,
/// Whether to always quote the name. /// Whether to always quote the name.
always_quote: bool, always_quote: bool,
/// Whether to show control characters. /// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool, show_control: bool,
}, },
/// Escape the name as a C string. /// Escape the name as a C string.
/// Used in, e.g., `ls --quote-name`.
C { C {
/// The type of quotes to use. /// The type of quotes to use.
quotes: Quotes, quotes: Quotes,
}, },
/// Escape the name as a literal string. /// Do not escape the string.
/// Used in, e.g., `ls --literal`.
Literal { Literal {
/// Whether to show control characters. /// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool, show_control: bool,
}, },
} }
@ -72,8 +76,9 @@ enum EscapeState {
Octal(EscapeOctal), Octal(EscapeOctal),
} }
/// Byte we need to present as escaped octal, in the form of `\nnn`
struct EscapeOctal { struct EscapeOctal {
c: char, c: u8,
state: EscapeOctalState, state: EscapeOctalState,
idx: usize, idx: usize,
} }
@ -95,20 +100,20 @@ impl Iterator for EscapeOctal {
Some('\\') Some('\\')
} }
EscapeOctalState::Value => { EscapeOctalState::Value => {
let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7; let octal_digit = ((self.c) >> (self.idx * 3)) & 0o7;
if self.idx == 0 { if self.idx == 0 {
self.state = EscapeOctalState::Done; self.state = EscapeOctalState::Done;
} else { } else {
self.idx -= 1; self.idx -= 1;
} }
Some(from_digit(octal_digit, 8).unwrap()) Some(from_digit(octal_digit.into(), 8).unwrap())
} }
} }
} }
} }
impl EscapeOctal { impl EscapeOctal {
fn from(c: char) -> Self { fn from(c: u8) -> Self {
Self { Self {
c, c,
idx: 2, idx: 2,
@ -124,6 +129,12 @@ impl EscapedChar {
} }
} }
/// Build an `EscapedChar` whose state is `EscapeState::Octal` for the raw byte `b`.
/// `EscapeOctal` is documented in this file as a byte presented in the form `\nnn`.
fn new_octal(b: u8) -> Self {
Self {
state: EscapeState::Octal(EscapeOctal::from(b)),
}
}
fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self { fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self {
use EscapeState::*; use EscapeState::*;
let init_state = match c { let init_state = match c {
@ -148,7 +159,7 @@ impl EscapedChar {
_ => Char(' '), _ => Char(' '),
}, },
':' if dirname => Backslash(':'), ':' if dirname => Backslash(':'),
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c)), _ if c.is_ascii_control() => Octal(EscapeOctal::from(c as u8)),
_ => Char(c), _ => Char(c),
}; };
Self { state: init_state } Self { state: init_state }
@ -165,7 +176,7 @@ impl EscapedChar {
'\x0B' => Backslash('v'), '\x0B' => Backslash('v'),
'\x0C' => Backslash('f'), '\x0C' => Backslash('f'),
'\r' => Backslash('r'), '\r' => Backslash('r'),
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)), '\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c as u8)),
'\'' => match quotes { '\'' => match quotes {
Quotes::Single => Backslash('\''), Quotes::Single => Backslash('\''),
_ => Char('\''), _ => Char('\''),
@ -205,11 +216,18 @@ impl Iterator for EscapedChar {
} }
} }
fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) { /// Check whether `bytes` starts with any byte in `pattern`.
let mut must_quote = false; fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool {
let mut escaped_str = String::with_capacity(name.len()); !bytes.is_empty() && pattern.contains(&bytes[0])
}
for c in name.chars() { fn shell_without_escape(name: &[u8], quotes: Quotes, show_control_chars: bool) -> (Vec<u8>, bool) {
let mut must_quote = false;
let mut escaped_str = Vec::with_capacity(name.len());
let mut utf8_buf = vec![0; 4];
for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = { let escaped = {
let ec = EscapedChar::new_shell(c, false, quotes); let ec = EscapedChar::new_shell(c, false, quotes);
if show_control_chars { if show_control_chars {
@ -220,31 +238,39 @@ fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) ->
}; };
match escaped.state { match escaped.state {
EscapeState::Backslash('\'') => escaped_str.push_str("'\\''"), EscapeState::Backslash('\'') => escaped_str.extend_from_slice(b"'\\''"),
EscapeState::ForceQuote(x) => { EscapeState::ForceQuote(x) => {
must_quote = true; must_quote = true;
escaped_str.push(x); escaped_str.extend_from_slice(x.encode_utf8(&mut utf8_buf).as_bytes());
} }
_ => { _ => {
for char in escaped { for c in escaped {
escaped_str.push(char); escaped_str.extend_from_slice(c.encode_utf8(&mut utf8_buf).as_bytes());
} }
} }
} }
} }
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); if show_control_chars {
escaped_str.extend_from_slice(s.invalid());
} else {
escaped_str.resize(escaped_str.len() + s.invalid().len(), b'?');
}
}
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str, must_quote) (escaped_str, must_quote)
} }
fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) { fn shell_with_escape(name: &[u8], quotes: Quotes) -> (Vec<u8>, bool) {
// We need to keep track of whether we are in a dollar expression // We need to keep track of whether we are in a dollar expression
// because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n' // because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
let mut in_dollar = false; let mut in_dollar = false;
let mut must_quote = false; let mut must_quote = false;
let mut escaped_str = String::with_capacity(name.len()); let mut escaped_str = String::with_capacity(name.len());
for c in name.chars() { for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = EscapedChar::new_shell(c, true, quotes); let escaped = EscapedChar::new_shell(c, true, quotes);
match escaped.state { match escaped.state {
EscapeState::Char(x) => { EscapeState::Char(x) => {
@ -282,25 +308,32 @@ fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) {
} }
} }
} }
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); if !s.invalid().is_empty() {
(escaped_str, must_quote) if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
let escaped_bytes: String = s
.invalid()
.iter()
.flat_map(|b| EscapedChar::new_octal(*b))
.collect();
escaped_str.push_str(&escaped_bytes);
}
}
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str.into(), must_quote)
} }
/// Return a set of characters that implies quoting of the word in /// Return a set of characters that implies quoting of the word in
/// shell-quoting mode. /// shell-quoting mode.
fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] { fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] {
const ESCAPED_CHARS: &[char] = &[ const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r=";
// the ':' colon character only induces quoting in the // the ':' colon character only induces quoting in the
// context of ls displaying a directory name before listing its content. // context of ls displaying a directory name before listing its content.
// (e.g. with the recursive flag -R) // (e.g. with the recursive flag -R)
':',
// Under this line are the control characters that should be
// quoted in shell mode in all cases.
'"', '`', '$', '\\', '^', '\n', '\t', '\r', '=',
];
let start_index = if is_dirname { 0 } else { 1 }; let start_index = if is_dirname { 0 } else { 1 };
&ESCAPED_CHARS[start_index..] &ESCAPED_CHARS[start_index..]
} }
@ -308,41 +341,57 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
/// ///
/// This inner function provides an additional flag `dirname` which /// This inner function provides an additional flag `dirname` which
/// is meant for ls' directory name display. /// is meant for ls' directory name display.
fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String { fn escape_name_inner(name: &[u8], style: &QuotingStyle, dirname: bool) -> Vec<u8> {
match style { match style {
QuotingStyle::Literal { show_control } => { QuotingStyle::Literal { show_control } => {
if *show_control { if *show_control {
name.to_string_lossy().into_owned() name.to_owned()
} else { } else {
name.to_string_lossy() name.utf8_chunks()
.map(|s| {
let valid: String = s
.valid()
.chars() .chars()
.flat_map(|c| EscapedChar::new_literal(c).hide_control()) .flat_map(|c| EscapedChar::new_literal(c).hide_control())
.collect() .collect();
let invalid = "?".repeat(s.invalid().len());
valid + &invalid
})
.collect::<String>()
.into()
} }
} }
QuotingStyle::C { quotes } => { QuotingStyle::C { quotes } => {
let escaped_str: String = name let escaped_str: String = name
.to_string_lossy() .utf8_chunks()
.flat_map(|s| {
let valid = s
.valid()
.chars() .chars()
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)) .flat_map(|c| EscapedChar::new_c(c, *quotes, dirname));
.collect(); let invalid = s.invalid().iter().flat_map(|b| EscapedChar::new_octal(*b));
valid.chain(invalid)
})
.collect::<String>();
match quotes { match quotes {
Quotes::Single => format!("'{escaped_str}'"), Quotes::Single => format!("'{escaped_str}'"),
Quotes::Double => format!("\"{escaped_str}\""), Quotes::Double => format!("\"{escaped_str}\""),
Quotes::None => escaped_str, Quotes::None => escaped_str,
} }
.into()
} }
QuotingStyle::Shell { QuotingStyle::Shell {
escape, escape,
always_quote, always_quote,
show_control, show_control,
} => { } => {
let name = name.to_string_lossy(); let (quotes, must_quote) = if name
.iter()
let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) { .any(|c| shell_escaped_char_set(dirname).contains(c))
{
(Quotes::Single, true) (Quotes::Single, true)
} else if name.contains('\'') { } else if name.contains(&b'\'') {
(Quotes::Double, true) (Quotes::Double, true)
} else if *always_quote { } else if *always_quote {
(Quotes::Single, true) (Quotes::Single, true)
@ -351,15 +400,24 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
}; };
let (escaped_str, contains_quote_chars) = if *escape { let (escaped_str, contains_quote_chars) = if *escape {
shell_with_escape(&name, quotes) shell_with_escape(name, quotes)
} else { } else {
shell_without_escape(&name, quotes, *show_control) shell_without_escape(name, quotes, *show_control)
}; };
match (must_quote | contains_quote_chars, quotes) { if must_quote | contains_quote_chars && quotes != Quotes::None {
(true, Quotes::Single) => format!("'{escaped_str}'"), let mut quoted_str = Vec::<u8>::with_capacity(escaped_str.len() + 2);
(true, Quotes::Double) => format!("\"{escaped_str}\""), let quote = if quotes == Quotes::Single {
_ => escaped_str, b'\''
} else {
b'"'
};
quoted_str.push(quote);
quoted_str.extend(escaped_str);
quoted_str.push(quote);
quoted_str
} else {
escaped_str
} }
} }
} }
@ -367,14 +425,16 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
/// Escape a filename with respect to the given style. /// Escape a filename with respect to the given style.
pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String { pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String {
escape_name_inner(name, style, false) let name = name.to_string_lossy();
String::from_utf8_lossy(&escape_name_inner(name.as_bytes(), style, false)).to_string()
} }
/// Escape a directory name with respect to the given style. /// Escape a directory name with respect to the given style.
/// This is mainly meant to be used for ls' directory name printing and is not /// This is mainly meant to be used for ls' directory name printing and is not
/// likely to be used elsewhere. /// likely to be used elsewhere.
pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String { pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String {
escape_name_inner(dir_name, style, true) let dir_name = dir_name.to_string_lossy();
String::from_utf8_lossy(&escape_name_inner(dir_name.as_bytes(), style, true)).to_string()
} }
impl fmt::Display for QuotingStyle { impl fmt::Display for QuotingStyle {
@ -415,7 +475,7 @@ impl fmt::Display for Quotes {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::quoting_style::{escape_name, Quotes, QuotingStyle}; use crate::quoting_style::{escape_name_inner, Quotes, QuotingStyle};
// spell-checker:ignore (tests/words) one\'two one'two // spell-checker:ignore (tests/words) one\'two one'two
@ -465,14 +525,31 @@ mod tests {
} }
} }
/// Escape `name` once per entry of `map`, using the style named by the entry's second
/// element (resolved through `get_style`) and with the `dirname` flag set to `false`.
/// The first element of each entry is ignored here; the callers compare it with the result.
fn check_names_inner<T>(name: &[u8], map: &[(T, &str)]) -> Vec<Vec<u8>> {
map.iter()
.map(|(_, style)| escape_name_inner(name, &get_style(style), false))
.collect()
}
fn check_names(name: &str, map: &[(&str, &str)]) { fn check_names(name: &str, map: &[(&str, &str)]) {
assert_eq!( assert_eq!(
map.iter() map.iter()
.map(|(_, style)| escape_name(name.as_ref(), &get_style(style))) .map(|(correct, _)| *correct)
.collect::<Vec<String>>(), .collect::<Vec<&str>>(),
check_names_inner(name.as_bytes(), map)
.iter()
.map(|bytes| std::str::from_utf8(bytes)
.expect("valid str goes in, valid str comes out"))
.collect::<Vec<&str>>()
);
}
fn check_names_raw(name: &[u8], map: &[(&[u8], &str)]) {
assert_eq!(
map.iter() map.iter()
.map(|(correct, _)| correct.to_string()) .map(|(correct, _)| *correct)
.collect::<Vec<String>>() .collect::<Vec<&[u8]>>(),
check_names_inner(name, map)
); );
} }
@ -732,6 +809,229 @@ mod tests {
); );
} }
// Exercises the escaping code on byte strings that are not valid UTF-8.
//
// Each `check_names_raw` call takes the input bytes and a table of
// `(expected output bytes, style name)` pairs. In the tables below an invalid byte is
// expected to come out as `?` in the `literal`, `shell` and `shell-always` styles,
// unchanged in the `-show` styles, and as a `\nnn` octal escape in the `escape`, `c`
// and `shell-escape` styles.
#[test]
fn test_non_unicode_bytes() {
let ascii = b'_';
// 0xA7 has the bit pattern 10xxxxxx, i.e. it is a UTF-8 continuation byte
let continuation = b'\xA7';
// lead bytes of 2-, 3- and 4-byte UTF-8 sequences, respectively
let first2byte = b'\xC2';
let first3byte = b'\xE0';
let first4byte = b'\xF0';
// 0xC0 never occurs in well-formed UTF-8
let invalid = b'\xC0';
// a single byte value invalid outside of additional context in UTF-8
check_names_raw(
&[continuation],
&[
(b"?", "literal"),
(b"\xA7", "literal-show"),
(b"\\247", "escape"),
(b"\"\\247\"", "c"),
(b"?", "shell"),
(b"\xA7", "shell-show"),
(b"'?'", "shell-always"),
(b"'\xA7'", "shell-always-show"),
(b"''$'\\247'", "shell-escape"),
(b"''$'\\247'", "shell-escape-always"),
],
);
// ...but the byte becomes valid with appropriate context
// (this is just the § character in UTF-8, written as bytes)
check_names_raw(
&[first2byte, continuation],
&[
(b"\xC2\xA7", "literal"),
(b"\xC2\xA7", "literal-show"),
(b"\xC2\xA7", "escape"),
(b"\"\xC2\xA7\"", "c"),
(b"\xC2\xA7", "shell"),
(b"\xC2\xA7", "shell-show"),
(b"'\xC2\xA7'", "shell-always"),
(b"'\xC2\xA7'", "shell-always-show"),
(b"\xC2\xA7", "shell-escape"),
(b"'\xC2\xA7'", "shell-escape-always"),
],
);
// mixed with valid characters
check_names_raw(
&[continuation, ascii],
&[
(b"?_", "literal"),
(b"\xA7_", "literal-show"),
(b"\\247_", "escape"),
(b"\"\\247_\"", "c"),
(b"?_", "shell"),
(b"\xA7_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xA7_'", "shell-always-show"),
(b"''$'\\247''_'", "shell-escape"),
(b"''$'\\247''_'", "shell-escape-always"),
],
);
check_names_raw(
&[ascii, continuation],
&[
(b"_?", "literal"),
(b"_\xA7", "literal-show"),
(b"_\\247", "escape"),
(b"\"_\\247\"", "c"),
(b"_?", "shell"),
(b"_\xA7", "shell-show"),
(b"'_?'", "shell-always"),
(b"'_\xA7'", "shell-always-show"),
(b"'_'$'\\247'", "shell-escape"),
(b"'_'$'\\247'", "shell-escape-always"),
],
);
check_names_raw(
&[ascii, continuation, ascii],
&[
(b"_?_", "literal"),
(b"_\xA7_", "literal-show"),
(b"_\\247_", "escape"),
(b"\"_\\247_\"", "c"),
(b"_?_", "shell"),
(b"_\xA7_", "shell-show"),
(b"'_?_'", "shell-always"),
(b"'_\xA7_'", "shell-always-show"),
(b"'_'$'\\247''_'", "shell-escape"),
(b"'_'$'\\247''_'", "shell-escape-always"),
],
);
check_names_raw(
&[continuation, ascii, continuation],
&[
(b"?_?", "literal"),
(b"\xA7_\xA7", "literal-show"),
(b"\\247_\\247", "escape"),
(b"\"\\247_\\247\"", "c"),
(b"?_?", "shell"),
(b"\xA7_\xA7", "shell-show"),
(b"'?_?'", "shell-always"),
(b"'\xA7_\xA7'", "shell-always-show"),
(b"''$'\\247''_'$'\\247'", "shell-escape"),
(b"''$'\\247''_'$'\\247'", "shell-escape-always"),
],
);
// contiguous invalid bytes
// (runs of 1, 2, 3 and 4 invalid bytes, each run separated by an ASCII `_`;
// every invalid byte is expected to yield its own `?` or octal escape)
check_names_raw(
&[
ascii,
invalid,
ascii,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
continuation,
ascii,
],
&[
(b"_?_??_???_????_", "literal"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"literal-show",
),
(
b"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_",
"escape",
),
(
b"\"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_\"",
"c",
),
(b"_?_??_???_????_", "shell"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"shell-show",
),
(b"'_?_??_???_????_'", "shell-always"),
(
b"'_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_'",
"shell-always-show",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape-always",
),
],
);
// invalid multi-byte sequences that start valid
check_names_raw(
&[first2byte, ascii],
&[
(b"?_", "literal"),
(b"\xC2_", "literal-show"),
(b"\\302_", "escape"),
(b"\"\\302_\"", "c"),
(b"?_", "shell"),
(b"\xC2_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xC2_'", "shell-always-show"),
(b"''$'\\302''_'", "shell-escape"),
(b"''$'\\302''_'", "shell-escape-always"),
],
);
// only the first 0xC2 is treated as invalid here; the trailing C2 A7 pair is
// expected to pass through as the valid character it encodes
check_names_raw(
&[first2byte, first2byte, continuation],
&[
(b"?\xC2\xA7", "literal"),
(b"\xC2\xC2\xA7", "literal-show"),
(b"\\302\xC2\xA7", "escape"),
(b"\"\\302\xC2\xA7\"", "c"),
(b"?\xC2\xA7", "shell"),
(b"\xC2\xC2\xA7", "shell-show"),
(b"'?\xC2\xA7'", "shell-always"),
(b"'\xC2\xC2\xA7'", "shell-always-show"),
(b"''$'\\302''\xC2\xA7'", "shell-escape"),
(b"''$'\\302''\xC2\xA7'", "shell-escape-always"),
],
);
check_names_raw(
&[first3byte, continuation, ascii],
&[
(b"??_", "literal"),
(b"\xE0\xA7_", "literal-show"),
(b"\\340\\247_", "escape"),
(b"\"\\340\\247_\"", "c"),
(b"??_", "shell"),
(b"\xE0\xA7_", "shell-show"),
(b"'??_'", "shell-always"),
(b"'\xE0\xA7_'", "shell-always-show"),
(b"''$'\\340\\247''_'", "shell-escape"),
(b"''$'\\340\\247''_'", "shell-escape-always"),
],
);
check_names_raw(
&[first4byte, continuation, continuation, ascii],
&[
(b"???_", "literal"),
(b"\xF0\xA7\xA7_", "literal-show"),
(b"\\360\\247\\247_", "escape"),
(b"\"\\360\\247\\247_\"", "c"),
(b"???_", "shell"),
(b"\xF0\xA7\xA7_", "shell-show"),
(b"'???_'", "shell-always"),
(b"'\xF0\xA7\xA7_'", "shell-always-show"),
(b"''$'\\360\\247\\247''_'", "shell-escape"),
(b"''$'\\360\\247\\247''_'", "shell-escape-always"),
],
);
}
#[test] #[test]
fn test_question_mark() { fn test_question_mark() {
// A question mark must force quotes in shell and shell-always, unless // A question mark must force quotes in shell and shell-always, unless