1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-30 12:37:49 +00:00

quoting_style: add support for non-unicode bytes

This new functionality is implemented, but not yet exposed here.
This commit is contained in:
Justin Tracey 2024-11-21 22:35:42 -05:00
parent cb3be5e3aa
commit 355103134b
No known key found for this signature in database
GPG key ID: 62B84F5ABDDDCE54

View file

@ -11,34 +11,38 @@ use std::fmt;
// These are characters with special meaning in the shell (e.g. bash).
// The first const contains characters that only have a special meaning when they appear at the beginning of a name.
const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#'];
const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#";
// PR#6559 : Remove `]{}` from special shell chars.
const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! ";
/// The quoting style to use when escaping a name.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum QuotingStyle {
/// Escape the name as a literal string.
/// Escape the name as a shell string.
/// Used in, e.g., `ls --quoting-style=shell`.
Shell {
/// Whether to escape characters in the name.
/// True in, e.g., `ls --quoting-style=shell-escape`.
escape: bool,
/// Whether to always quote the name.
always_quote: bool,
/// Whether to show control characters.
/// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool,
},
/// Escape the name as a C string.
/// Used in, e.g., `ls --quote-name`.
C {
/// The type of quotes to use.
quotes: Quotes,
},
/// Escape the name as a literal string.
/// Do not escape the string.
/// Used in, e.g., `ls --literal`.
Literal {
/// Whether to show control characters.
/// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool,
},
}
@ -72,8 +76,9 @@ enum EscapeState {
Octal(EscapeOctal),
}
/// Byte we need to present as escaped octal, in the form of `\nnn`
struct EscapeOctal {
c: char,
c: u8,
state: EscapeOctalState,
idx: usize,
}
@ -95,20 +100,20 @@ impl Iterator for EscapeOctal {
Some('\\')
}
EscapeOctalState::Value => {
let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7;
let octal_digit = ((self.c) >> (self.idx * 3)) & 0o7;
if self.idx == 0 {
self.state = EscapeOctalState::Done;
} else {
self.idx -= 1;
}
Some(from_digit(octal_digit, 8).unwrap())
Some(from_digit(octal_digit.into(), 8).unwrap())
}
}
}
}
impl EscapeOctal {
fn from(c: char) -> Self {
fn from(c: u8) -> Self {
Self {
c,
idx: 2,
@ -124,6 +129,12 @@ impl EscapedChar {
}
}
fn new_octal(b: u8) -> Self {
Self {
state: EscapeState::Octal(EscapeOctal::from(b)),
}
}
fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self {
use EscapeState::*;
let init_state = match c {
@ -148,7 +159,7 @@ impl EscapedChar {
_ => Char(' '),
},
':' if dirname => Backslash(':'),
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c)),
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c as u8)),
_ => Char(c),
};
Self { state: init_state }
@ -165,7 +176,7 @@ impl EscapedChar {
'\x0B' => Backslash('v'),
'\x0C' => Backslash('f'),
'\r' => Backslash('r'),
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)),
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c as u8)),
'\'' => match quotes {
Quotes::Single => Backslash('\''),
_ => Char('\''),
@ -205,102 +216,124 @@ impl Iterator for EscapedChar {
}
}
fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) {
/// Check whether `bytes` starts with any byte in `pattern`.
fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool {
!bytes.is_empty() && pattern.contains(&bytes[0])
}
fn shell_without_escape(name: &[u8], quotes: Quotes, show_control_chars: bool) -> (Vec<u8>, bool) {
let mut must_quote = false;
let mut escaped_str = String::with_capacity(name.len());
let mut escaped_str = Vec::with_capacity(name.len());
let mut utf8_buf = vec![0; 4];
for c in name.chars() {
let escaped = {
let ec = EscapedChar::new_shell(c, false, quotes);
if show_control_chars {
ec
} else {
ec.hide_control()
}
};
for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = {
let ec = EscapedChar::new_shell(c, false, quotes);
if show_control_chars {
ec
} else {
ec.hide_control()
}
};
match escaped.state {
EscapeState::Backslash('\'') => escaped_str.push_str("'\\''"),
EscapeState::ForceQuote(x) => {
must_quote = true;
escaped_str.push(x);
}
_ => {
for char in escaped {
escaped_str.push(char);
match escaped.state {
EscapeState::Backslash('\'') => escaped_str.extend_from_slice(b"'\\''"),
EscapeState::ForceQuote(x) => {
must_quote = true;
escaped_str.extend_from_slice(x.encode_utf8(&mut utf8_buf).as_bytes());
}
_ => {
for c in escaped {
escaped_str.extend_from_slice(c.encode_utf8(&mut utf8_buf).as_bytes());
}
}
}
}
if show_control_chars {
escaped_str.extend_from_slice(s.invalid());
} else {
escaped_str.resize(escaped_str.len() + s.invalid().len(), b'?');
}
}
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START);
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str, must_quote)
}
fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) {
fn shell_with_escape(name: &[u8], quotes: Quotes) -> (Vec<u8>, bool) {
// We need to keep track of whether we are in a dollar expression
// because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
let mut in_dollar = false;
let mut must_quote = false;
let mut escaped_str = String::with_capacity(name.len());
for c in name.chars() {
let escaped = EscapedChar::new_shell(c, true, quotes);
match escaped.state {
EscapeState::Char(x) => {
if in_dollar {
escaped_str.push_str("''");
for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = EscapedChar::new_shell(c, true, quotes);
match escaped.state {
EscapeState::Char(x) => {
if in_dollar {
escaped_str.push_str("''");
in_dollar = false;
}
escaped_str.push(x);
}
EscapeState::ForceQuote(x) => {
if in_dollar {
escaped_str.push_str("''");
in_dollar = false;
}
must_quote = true;
escaped_str.push(x);
}
// Single quotes are not put in dollar expressions, but are escaped
// if the string also contains double quotes. In that case, they must
// be handled separately.
EscapeState::Backslash('\'') => {
must_quote = true;
in_dollar = false;
escaped_str.push_str("'\\''");
}
escaped_str.push(x);
}
EscapeState::ForceQuote(x) => {
if in_dollar {
escaped_str.push_str("''");
in_dollar = false;
}
must_quote = true;
escaped_str.push(x);
}
// Single quotes are not put in dollar expressions, but are escaped
// if the string also contains double quotes. In that case, they must
// be handled separately.
EscapeState::Backslash('\'') => {
must_quote = true;
in_dollar = false;
escaped_str.push_str("'\\''");
}
_ => {
if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
for char in escaped {
escaped_str.push(char);
_ => {
if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
for char in escaped {
escaped_str.push(char);
}
}
}
}
if !s.invalid().is_empty() {
if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
let escaped_bytes: String = s
.invalid()
.iter()
.flat_map(|b| EscapedChar::new_octal(*b))
.collect();
escaped_str.push_str(&escaped_bytes);
}
}
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START);
(escaped_str, must_quote)
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str.into(), must_quote)
}
/// Return a set of characters that implies quoting of the word in
/// shell-quoting mode.
fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
const ESCAPED_CHARS: &[char] = &[
// the ':' colon character only induce quoting in the
// context of ls displaying a directory name before listing its content.
// (e.g. with the recursive flag -R)
':',
// Under this line are the control characters that should be
// quoted in shell mode in all cases.
'"', '`', '$', '\\', '^', '\n', '\t', '\r', '=',
];
fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] {
const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r=";
// the ':' colon character only induce quoting in the
// context of ls displaying a directory name before listing its content.
// (e.g. with the recursive flag -R)
let start_index = if is_dirname { 0 } else { 1 };
&ESCAPED_CHARS[start_index..]
}
@ -308,41 +341,57 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
///
/// This inner function provides an additional flag `dirname` which
/// is meant for ls' directory name display.
fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String {
fn escape_name_inner(name: &[u8], style: &QuotingStyle, dirname: bool) -> Vec<u8> {
match style {
QuotingStyle::Literal { show_control } => {
if *show_control {
name.to_string_lossy().into_owned()
name.to_owned()
} else {
name.to_string_lossy()
.chars()
.flat_map(|c| EscapedChar::new_literal(c).hide_control())
.collect()
name.utf8_chunks()
.map(|s| {
let valid: String = s
.valid()
.chars()
.flat_map(|c| EscapedChar::new_literal(c).hide_control())
.collect();
let invalid = "?".repeat(s.invalid().len());
valid + &invalid
})
.collect::<String>()
.into()
}
}
QuotingStyle::C { quotes } => {
let escaped_str: String = name
.to_string_lossy()
.chars()
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname))
.collect();
.utf8_chunks()
.flat_map(|s| {
let valid = s
.valid()
.chars()
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname));
let invalid = s.invalid().iter().flat_map(|b| EscapedChar::new_octal(*b));
valid.chain(invalid)
})
.collect::<String>();
match quotes {
Quotes::Single => format!("'{escaped_str}'"),
Quotes::Double => format!("\"{escaped_str}\""),
Quotes::None => escaped_str,
}
.into()
}
QuotingStyle::Shell {
escape,
always_quote,
show_control,
} => {
let name = name.to_string_lossy();
let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) {
let (quotes, must_quote) = if name
.iter()
.any(|c| shell_escaped_char_set(dirname).contains(c))
{
(Quotes::Single, true)
} else if name.contains('\'') {
} else if name.contains(&b'\'') {
(Quotes::Double, true)
} else if *always_quote {
(Quotes::Single, true)
@ -351,15 +400,24 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
};
let (escaped_str, contains_quote_chars) = if *escape {
shell_with_escape(&name, quotes)
shell_with_escape(name, quotes)
} else {
shell_without_escape(&name, quotes, *show_control)
shell_without_escape(name, quotes, *show_control)
};
match (must_quote | contains_quote_chars, quotes) {
(true, Quotes::Single) => format!("'{escaped_str}'"),
(true, Quotes::Double) => format!("\"{escaped_str}\""),
_ => escaped_str,
if must_quote | contains_quote_chars && quotes != Quotes::None {
let mut quoted_str = Vec::<u8>::with_capacity(escaped_str.len() + 2);
let quote = if quotes == Quotes::Single {
b'\''
} else {
b'"'
};
quoted_str.push(quote);
quoted_str.extend(escaped_str);
quoted_str.push(quote);
quoted_str
} else {
escaped_str
}
}
}
@ -367,14 +425,16 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
/// Escape a filename with respect to the given style.
pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String {
escape_name_inner(name, style, false)
let name = name.to_string_lossy();
String::from_utf8_lossy(&escape_name_inner(name.as_bytes(), style, false)).to_string()
}
/// Escape a directory name with respect to the given style.
/// This is mainly meant to be used for ls' directory name printing and is not
/// likely to be used elsewhere.
pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String {
escape_name_inner(dir_name, style, true)
let dir_name = dir_name.to_string_lossy();
String::from_utf8_lossy(&escape_name_inner(dir_name.as_bytes(), style, true)).to_string()
}
impl fmt::Display for QuotingStyle {
@ -415,7 +475,7 @@ impl fmt::Display for Quotes {
#[cfg(test)]
mod tests {
use crate::quoting_style::{escape_name, Quotes, QuotingStyle};
use crate::quoting_style::{escape_name_inner, Quotes, QuotingStyle};
// spell-checker:ignore (tests/words) one\'two one'two
@ -465,14 +525,31 @@ mod tests {
}
}
fn check_names_inner<T>(name: &[u8], map: &[(T, &str)]) -> Vec<Vec<u8>> {
map.iter()
.map(|(_, style)| escape_name_inner(name, &get_style(style), false))
.collect()
}
fn check_names(name: &str, map: &[(&str, &str)]) {
assert_eq!(
map.iter()
.map(|(_, style)| escape_name(name.as_ref(), &get_style(style)))
.collect::<Vec<String>>(),
.map(|(correct, _)| *correct)
.collect::<Vec<&str>>(),
check_names_inner(name.as_bytes(), map)
.iter()
.map(|bytes| std::str::from_utf8(bytes)
.expect("valid str goes in, valid str comes out"))
.collect::<Vec<&str>>()
);
}
fn check_names_raw(name: &[u8], map: &[(&[u8], &str)]) {
assert_eq!(
map.iter()
.map(|(correct, _)| correct.to_string())
.collect::<Vec<String>>()
.map(|(correct, _)| *correct)
.collect::<Vec<&[u8]>>(),
check_names_inner(name, map)
);
}
@ -732,6 +809,229 @@ mod tests {
);
}
#[test]
fn test_non_unicode_bytes() {
let ascii = b'_';
let continuation = b'\xA7';
let first2byte = b'\xC2';
let first3byte = b'\xE0';
let first4byte = b'\xF0';
let invalid = b'\xC0';
// a single byte value invalid outside of additional context in UTF-8
check_names_raw(
&[continuation],
&[
(b"?", "literal"),
(b"\xA7", "literal-show"),
(b"\\247", "escape"),
(b"\"\\247\"", "c"),
(b"?", "shell"),
(b"\xA7", "shell-show"),
(b"'?'", "shell-always"),
(b"'\xA7'", "shell-always-show"),
(b"''$'\\247'", "shell-escape"),
(b"''$'\\247'", "shell-escape-always"),
],
);
// ...but the byte becomes valid with appropriate context
// (this is just the § character in UTF-8, written as bytes)
check_names_raw(
&[first2byte, continuation],
&[
(b"\xC2\xA7", "literal"),
(b"\xC2\xA7", "literal-show"),
(b"\xC2\xA7", "escape"),
(b"\"\xC2\xA7\"", "c"),
(b"\xC2\xA7", "shell"),
(b"\xC2\xA7", "shell-show"),
(b"'\xC2\xA7'", "shell-always"),
(b"'\xC2\xA7'", "shell-always-show"),
(b"\xC2\xA7", "shell-escape"),
(b"'\xC2\xA7'", "shell-escape-always"),
],
);
// mixed with valid characters
check_names_raw(
&[continuation, ascii],
&[
(b"?_", "literal"),
(b"\xA7_", "literal-show"),
(b"\\247_", "escape"),
(b"\"\\247_\"", "c"),
(b"?_", "shell"),
(b"\xA7_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xA7_'", "shell-always-show"),
(b"''$'\\247''_'", "shell-escape"),
(b"''$'\\247''_'", "shell-escape-always"),
],
);
check_names_raw(
&[ascii, continuation],
&[
(b"_?", "literal"),
(b"_\xA7", "literal-show"),
(b"_\\247", "escape"),
(b"\"_\\247\"", "c"),
(b"_?", "shell"),
(b"_\xA7", "shell-show"),
(b"'_?'", "shell-always"),
(b"'_\xA7'", "shell-always-show"),
(b"'_'$'\\247'", "shell-escape"),
(b"'_'$'\\247'", "shell-escape-always"),
],
);
check_names_raw(
&[ascii, continuation, ascii],
&[
(b"_?_", "literal"),
(b"_\xA7_", "literal-show"),
(b"_\\247_", "escape"),
(b"\"_\\247_\"", "c"),
(b"_?_", "shell"),
(b"_\xA7_", "shell-show"),
(b"'_?_'", "shell-always"),
(b"'_\xA7_'", "shell-always-show"),
(b"'_'$'\\247''_'", "shell-escape"),
(b"'_'$'\\247''_'", "shell-escape-always"),
],
);
check_names_raw(
&[continuation, ascii, continuation],
&[
(b"?_?", "literal"),
(b"\xA7_\xA7", "literal-show"),
(b"\\247_\\247", "escape"),
(b"\"\\247_\\247\"", "c"),
(b"?_?", "shell"),
(b"\xA7_\xA7", "shell-show"),
(b"'?_?'", "shell-always"),
(b"'\xA7_\xA7'", "shell-always-show"),
(b"''$'\\247''_'$'\\247'", "shell-escape"),
(b"''$'\\247''_'$'\\247'", "shell-escape-always"),
],
);
// contiguous invalid bytes
check_names_raw(
&[
ascii,
invalid,
ascii,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
continuation,
ascii,
],
&[
(b"_?_??_???_????_", "literal"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"literal-show",
),
(
b"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_",
"escape",
),
(
b"\"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_\"",
"c",
),
(b"_?_??_???_????_", "shell"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"shell-show",
),
(b"'_?_??_???_????_'", "shell-always"),
(
b"'_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_'",
"shell-always-show",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape-always",
),
],
);
// invalid multi-byte sequences that start valid
check_names_raw(
&[first2byte, ascii],
&[
(b"?_", "literal"),
(b"\xC2_", "literal-show"),
(b"\\302_", "escape"),
(b"\"\\302_\"", "c"),
(b"?_", "shell"),
(b"\xC2_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xC2_'", "shell-always-show"),
(b"''$'\\302''_'", "shell-escape"),
(b"''$'\\302''_'", "shell-escape-always"),
],
);
check_names_raw(
&[first2byte, first2byte, continuation],
&[
(b"?\xC2\xA7", "literal"),
(b"\xC2\xC2\xA7", "literal-show"),
(b"\\302\xC2\xA7", "escape"),
(b"\"\\302\xC2\xA7\"", "c"),
(b"?\xC2\xA7", "shell"),
(b"\xC2\xC2\xA7", "shell-show"),
(b"'?\xC2\xA7'", "shell-always"),
(b"'\xC2\xC2\xA7'", "shell-always-show"),
(b"''$'\\302''\xC2\xA7'", "shell-escape"),
(b"''$'\\302''\xC2\xA7'", "shell-escape-always"),
],
);
check_names_raw(
&[first3byte, continuation, ascii],
&[
(b"??_", "literal"),
(b"\xE0\xA7_", "literal-show"),
(b"\\340\\247_", "escape"),
(b"\"\\340\\247_\"", "c"),
(b"??_", "shell"),
(b"\xE0\xA7_", "shell-show"),
(b"'??_'", "shell-always"),
(b"'\xE0\xA7_'", "shell-always-show"),
(b"''$'\\340\\247''_'", "shell-escape"),
(b"''$'\\340\\247''_'", "shell-escape-always"),
],
);
check_names_raw(
&[first4byte, continuation, continuation, ascii],
&[
(b"???_", "literal"),
(b"\xF0\xA7\xA7_", "literal-show"),
(b"\\360\\247\\247_", "escape"),
(b"\"\\360\\247\\247_\"", "c"),
(b"???_", "shell"),
(b"\xF0\xA7\xA7_", "shell-show"),
(b"'???_'", "shell-always"),
(b"'\xF0\xA7\xA7_'", "shell-always-show"),
(b"''$'\\360\\247\\247''_'", "shell-escape"),
(b"''$'\\360\\247\\247''_'", "shell-escape-always"),
],
);
}
#[test]
fn test_question_mark() {
// A question mark must force quotes in shell and shell-always, unless