1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-29 03:57:44 +00:00

Merge pull request #6882 from jtracey/quoting_style_bytes

quoting_style: Add support for non-UTF-8 bytes
This commit is contained in:
Sylvestre Ledru 2024-12-21 23:17:43 +01:00 committed by GitHub
commit bb2fb66073
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 584 additions and 175 deletions

View file

@ -1,4 +1,4 @@
msrv = "1.77.0" msrv = "1.79.0"
cognitive-complexity-threshold = 24 cognitive-complexity-threshold = 24
missing-docs-in-crate-items = true missing-docs-in-crate-items = true
check-private-items = true check-private-items = true

View file

@ -11,7 +11,7 @@ env:
PROJECT_NAME: coreutils PROJECT_NAME: coreutils
PROJECT_DESC: "Core universal (cross-platform) utilities" PROJECT_DESC: "Core universal (cross-platform) utilities"
PROJECT_AUTH: "uutils" PROJECT_AUTH: "uutils"
RUST_MIN_SRV: "1.77.0" RUST_MIN_SRV: "1.79.0"
# * style job configuration # * style job configuration
STYLE_FAIL_ON_FAULT: true ## (bool) fail the build if a style job contains a fault (error or warning); may be overridden on a per-job basis STYLE_FAIL_ON_FAULT: true ## (bool) fail the build if a style job contains a fault (error or warning); may be overridden on a per-job basis

View file

@ -16,7 +16,7 @@ repository = "https://github.com/uutils/coreutils"
readme = "README.md" readme = "README.md"
keywords = ["coreutils", "uutils", "cross-platform", "cli", "utility"] keywords = ["coreutils", "uutils", "cross-platform", "cli", "utility"]
categories = ["command-line-utilities"] categories = ["command-line-utilities"]
rust-version = "1.77.0" rust-version = "1.79.0"
edition = "2021" edition = "2021"
build = "build.rs" build = "build.rs"

View file

@ -14,7 +14,7 @@
[![dependency status](https://deps.rs/repo/github/uutils/coreutils/status.svg)](https://deps.rs/repo/github/uutils/coreutils) [![dependency status](https://deps.rs/repo/github/uutils/coreutils/status.svg)](https://deps.rs/repo/github/uutils/coreutils)
[![CodeCov](https://codecov.io/gh/uutils/coreutils/branch/master/graph/badge.svg)](https://codecov.io/gh/uutils/coreutils) [![CodeCov](https://codecov.io/gh/uutils/coreutils/branch/master/graph/badge.svg)](https://codecov.io/gh/uutils/coreutils)
![MSRV](https://img.shields.io/badge/MSRV-1.77.0-brightgreen) ![MSRV](https://img.shields.io/badge/MSRV-1.79.0-brightgreen)
</div> </div>
@ -70,7 +70,7 @@ the [coreutils docs](https://github.com/uutils/uutils.github.io) repository.
### Rust Version ### Rust Version
uutils follows Rust's release channels and is tested against stable, beta and uutils follows Rust's release channels and is tested against stable, beta and
nightly. The current Minimum Supported Rust Version (MSRV) is `1.77.0`. nightly. The current Minimum Supported Rust Version (MSRV) is `1.79.0`.
## Building ## Building

View file

@ -21,7 +21,7 @@ use std::os::windows::fs::MetadataExt;
use std::{ use std::{
cmp::Reverse, cmp::Reverse,
error::Error, error::Error,
ffi::OsString, ffi::{OsStr, OsString},
fmt::{Display, Write as FmtWrite}, fmt::{Display, Write as FmtWrite},
fs::{self, DirEntry, FileType, Metadata, ReadDir}, fs::{self, DirEntry, FileType, Metadata, ReadDir},
io::{stdout, BufWriter, ErrorKind, Stdout, Write}, io::{stdout, BufWriter, ErrorKind, Stdout, Write},
@ -55,7 +55,7 @@ use uucore::libc::{dev_t, major, minor};
#[cfg(unix)] #[cfg(unix)]
use uucore::libc::{S_IXGRP, S_IXOTH, S_IXUSR}; use uucore::libc::{S_IXGRP, S_IXOTH, S_IXUSR};
use uucore::line_ending::LineEnding; use uucore::line_ending::LineEnding;
use uucore::quoting_style::{escape_dir_name, escape_name, QuotingStyle}; use uucore::quoting_style::{self, QuotingStyle};
use uucore::{ use uucore::{
display::Quotable, display::Quotable,
error::{set_exit_code, UError, UResult}, error::{set_exit_code, UError, UResult},
@ -2048,7 +2048,11 @@ impl PathData {
/// file11 /// file11
/// ``` /// ```
fn show_dir_name(path_data: &PathData, out: &mut BufWriter<Stdout>, config: &Config) { fn show_dir_name(path_data: &PathData, out: &mut BufWriter<Stdout>, config: &Config) {
let escaped_name = escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style); // FIXME: replace this with appropriate behavior for literal unprintable bytes
let escaped_name =
quoting_style::escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style)
.to_string_lossy()
.to_string();
let name = if config.hyperlink && !config.dired { let name = if config.hyperlink && !config.dired {
create_hyperlink(&escaped_name, path_data) create_hyperlink(&escaped_name, path_data)
@ -3002,7 +3006,6 @@ use std::sync::Mutex;
#[cfg(unix)] #[cfg(unix)]
use uucore::entries; use uucore::entries;
use uucore::fs::FileInformation; use uucore::fs::FileInformation;
use uucore::quoting_style;
#[cfg(unix)] #[cfg(unix)]
fn cached_uid2usr(uid: u32) -> String { fn cached_uid2usr(uid: u32) -> String {
@ -3542,3 +3545,10 @@ fn calculate_padding_collection(
padding_collections padding_collections
} }
/// Escape `name` according to `style` and return it as a `String`.
///
/// Delegates to `quoting_style::escape_name` and converts the result
/// lossily, so any part of the escaped name that is not valid Unicode is
/// replaced during the conversion.
// FIXME: replace this with appropriate behavior for literal unprintable bytes
fn escape_name(name: &OsStr, style: &QuotingStyle) -> String {
    let escaped = quoting_style::escape_name(name, style);
    escaped.to_string_lossy().into_owned()
}

View file

@ -13,7 +13,7 @@ mod word_count;
use std::{ use std::{
borrow::{Borrow, Cow}, borrow::{Borrow, Cow},
cmp::max, cmp::max,
ffi::OsString, ffi::{OsStr, OsString},
fs::{self, File}, fs::{self, File},
io::{self, Write}, io::{self, Write},
iter, iter,
@ -28,7 +28,7 @@ use utf8::{BufReadDecoder, BufReadDecoderError};
use uucore::{ use uucore::{
error::{FromIo, UError, UResult}, error::{FromIo, UError, UResult},
format_usage, help_about, help_usage, format_usage, help_about, help_usage,
quoting_style::{escape_name, QuotingStyle}, quoting_style::{self, QuotingStyle},
shortcut_value_parser::ShortcutValueParser, shortcut_value_parser::ShortcutValueParser,
show, show,
}; };
@ -259,7 +259,7 @@ impl<'a> Input<'a> {
match self { match self {
Self::Path(path) => Some(match path.to_str() { Self::Path(path) => Some(match path.to_str() {
Some(s) if !s.contains('\n') => Cow::Borrowed(s), Some(s) if !s.contains('\n') => Cow::Borrowed(s),
_ => Cow::Owned(escape_name(path.as_os_str(), QS_ESCAPE)), _ => Cow::Owned(escape_name_wrapper(path.as_os_str())),
}), }),
Self::Stdin(StdinKind::Explicit) => Some(Cow::Borrowed(STDIN_REPR)), Self::Stdin(StdinKind::Explicit) => Some(Cow::Borrowed(STDIN_REPR)),
Self::Stdin(StdinKind::Implicit) => None, Self::Stdin(StdinKind::Implicit) => None,
@ -269,7 +269,7 @@ impl<'a> Input<'a> {
/// Converts input into the form that appears in errors. /// Converts input into the form that appears in errors.
fn path_display(&self) -> String { fn path_display(&self) -> String {
match self { match self {
Self::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE), Self::Path(path) => escape_name_wrapper(path.as_os_str()),
Self::Stdin(_) => String::from("standard input"), Self::Stdin(_) => String::from("standard input"),
} }
} }
@ -361,7 +361,7 @@ impl WcError {
Some((input, idx)) => { Some((input, idx)) => {
let path = match input { let path = match input {
Input::Stdin(_) => STDIN_REPR.into(), Input::Stdin(_) => STDIN_REPR.into(),
Input::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE).into(), Input::Path(path) => escape_name_wrapper(path.as_os_str()).into(),
}; };
Self::ZeroLengthFileNameCtx { path, idx } Self::ZeroLengthFileNameCtx { path, idx }
} }
@ -761,7 +761,9 @@ fn files0_iter_file<'a>(path: &Path) -> UResult<impl Iterator<Item = InputIterIt
Err(e) => Err(e.map_err_context(|| { Err(e) => Err(e.map_err_context(|| {
format!( format!(
"cannot open {} for reading", "cannot open {} for reading",
escape_name(path.as_os_str(), QS_QUOTE_ESCAPE) quoting_style::escape_name(path.as_os_str(), QS_QUOTE_ESCAPE)
.into_string()
.expect("All escaped names with the escaping option return valid strings.")
) )
})), })),
} }
@ -793,9 +795,9 @@ fn files0_iter<'a>(
Ok(Input::Path(PathBuf::from(s).into())) Ok(Input::Path(PathBuf::from(s).into()))
} }
} }
Err(e) => Err(e.map_err_context(|| { Err(e) => Err(e
format!("{}: read error", escape_name(&err_path, QS_ESCAPE)) .map_err_context(|| format!("{}: read error", escape_name_wrapper(&err_path)))
}) as Box<dyn UError>), as Box<dyn UError>),
}), }),
); );
// Loop until there is an error; yield that error and then nothing else. // Loop until there is an error; yield that error and then nothing else.
@ -808,6 +810,12 @@ fn files0_iter<'a>(
}) })
} }
/// Escape `name` with the `QS_ESCAPE` quoting style and return it as a `String`.
///
/// # Panics
///
/// Panics if the escaped name cannot be converted into a `String`; the
/// `expect` message records the assumption that this style always yields
/// valid strings.
fn escape_name_wrapper(name: &OsStr) -> String {
    let escaped = quoting_style::escape_name(name, QS_ESCAPE);
    escaped
        .into_string()
        .expect("All escaped names with the escaping option return valid strings.")
}
fn wc(inputs: &Inputs, settings: &Settings) -> UResult<()> { fn wc(inputs: &Inputs, settings: &Settings) -> UResult<()> {
let mut total_word_count = WordCount::default(); let mut total_word_count = WordCount::default();
let mut num_inputs: usize = 0; let mut num_inputs: usize = 0;

View file

@ -112,7 +112,8 @@ fn extract_value<T: Default>(p: Result<T, ParseError<'_, T>>, input: &str) -> T
Default::default() Default::default()
} }
ParseError::PartialMatch(v, rest) => { ParseError::PartialMatch(v, rest) => {
if input.starts_with('\'') { let bytes = input.as_encoded_bytes();
if !bytes.is_empty() && bytes[0] == b'\'' {
show_warning!( show_warning!(
"{}: character(s) following character constant have been ignored", "{}: character(s) following character constant have been ignored",
&rest, &rest,

View file

@ -353,20 +353,20 @@ impl Spec {
writer.write_all(&parsed).map_err(FormatError::IoError) writer.write_all(&parsed).map_err(FormatError::IoError)
} }
Self::QuotedString => { Self::QuotedString => {
let s = args.get_str(); let s = escape_name(
writer args.get_str().as_ref(),
.write_all(
escape_name(
s.as_ref(),
&QuotingStyle::Shell { &QuotingStyle::Shell {
escape: true, escape: true,
always_quote: false, always_quote: false,
show_control: false, show_control: false,
}, },
) );
.as_bytes(), #[cfg(unix)]
) let bytes = std::os::unix::ffi::OsStringExt::into_vec(s);
.map_err(FormatError::IoError) #[cfg(not(unix))]
let bytes = s.to_string_lossy().as_bytes().to_owned();
writer.write_all(&bytes).map_err(FormatError::IoError)
} }
Self::SignedInt { Self::SignedInt {
width, width,

View file

@ -6,39 +6,43 @@
//! Set of functions for escaping names according to different quoting styles. //! Set of functions for escaping names according to different quoting styles.
use std::char::from_digit; use std::char::from_digit;
use std::ffi::OsStr; use std::ffi::{OsStr, OsString};
use std::fmt; use std::fmt;
// These are characters with special meaning in the shell (e.g. bash). // These are characters with special meaning in the shell (e.g. bash).
// The first const contains characters that only have a special meaning when they appear at the beginning of a name. // The first const contains characters that only have a special meaning when they appear at the beginning of a name.
const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#']; const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#";
// PR#6559 : Remove `]{}` from special shell chars. // PR#6559 : Remove `]{}` from special shell chars.
const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! "; const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! ";
/// The quoting style to use when escaping a name. /// The quoting style to use when escaping a name.
#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum QuotingStyle { pub enum QuotingStyle {
/// Escape the name as a literal string. /// Escape the name as a shell string.
/// Used in, e.g., `ls --quoting-style=shell`.
Shell { Shell {
/// Whether to escape characters in the name. /// Whether to escape characters in the name.
/// True in, e.g., `ls --quoting-style=shell-escape`.
escape: bool, escape: bool,
/// Whether to always quote the name. /// Whether to always quote the name.
always_quote: bool, always_quote: bool,
/// Whether to show control characters. /// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool, show_control: bool,
}, },
/// Escape the name as a C string. /// Escape the name as a C string.
/// Used in, e.g., `ls --quote-name`.
C { C {
/// The type of quotes to use. /// The type of quotes to use.
quotes: Quotes, quotes: Quotes,
}, },
/// Escape the name as a literal string. /// Do not escape the string.
/// Used in, e.g., `ls --literal`.
Literal { Literal {
/// Whether to show control characters. /// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool, show_control: bool,
}, },
} }
@ -72,16 +76,24 @@ enum EscapeState {
Octal(EscapeOctal), Octal(EscapeOctal),
} }
/// Bytes we need to present as escaped octal, in the form of `\nnn` per byte.
/// Only supports characters up to 2 bytes long in UTF-8.
struct EscapeOctal { struct EscapeOctal {
c: char, c: [u8; 2],
state: EscapeOctalState, state: EscapeOctalState,
idx: usize, idx: u8,
} }
enum EscapeOctalState { enum EscapeOctalState {
Done, Done,
Backslash, FirstBackslash,
Value, FirstValue,
LastBackslash,
LastValue,
}
/// Extract one octal digit (3 bits) from `byte`.
///
/// `idx` selects the digit counting from the least significant end:
/// `0` is the lowest three bits, `1` the next three, `2` the top two bits.
fn byte_to_octal_digit(byte: u8, idx: u8) -> u8 {
    // Each octal digit covers three bits.
    let shift = idx * 3;
    let shifted = byte >> shift;
    shifted & 0b111
}
impl Iterator for EscapeOctal { impl Iterator for EscapeOctal {
@ -90,29 +102,57 @@ impl Iterator for EscapeOctal {
fn next(&mut self) -> Option<char> { fn next(&mut self) -> Option<char> {
match self.state { match self.state {
EscapeOctalState::Done => None, EscapeOctalState::Done => None,
EscapeOctalState::Backslash => { EscapeOctalState::FirstBackslash => {
self.state = EscapeOctalState::Value; self.state = EscapeOctalState::FirstValue;
Some('\\') Some('\\')
} }
EscapeOctalState::Value => { EscapeOctalState::LastBackslash => {
let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7; self.state = EscapeOctalState::LastValue;
Some('\\')
}
EscapeOctalState::FirstValue => {
let octal_digit = byte_to_octal_digit(self.c[0], self.idx);
if self.idx == 0 {
self.state = EscapeOctalState::LastBackslash;
self.idx = 2;
} else {
self.idx -= 1;
}
Some(from_digit(octal_digit.into(), 8).unwrap())
}
EscapeOctalState::LastValue => {
let octal_digit = byte_to_octal_digit(self.c[1], self.idx);
if self.idx == 0 { if self.idx == 0 {
self.state = EscapeOctalState::Done; self.state = EscapeOctalState::Done;
} else { } else {
self.idx -= 1; self.idx -= 1;
} }
Some(from_digit(octal_digit, 8).unwrap()) Some(from_digit(octal_digit.into(), 8).unwrap())
} }
} }
} }
} }
impl EscapeOctal { impl EscapeOctal {
fn from(c: char) -> Self { fn from_char(c: char) -> Self {
if c.len_utf8() == 1 {
return Self::from_byte(c as u8);
}
let mut buf = [0; 2];
let _s = c.encode_utf8(&mut buf);
Self { Self {
c, c: buf,
idx: 2, idx: 2,
state: EscapeOctalState::Backslash, state: EscapeOctalState::FirstBackslash,
}
}
fn from_byte(b: u8) -> Self {
Self {
c: [0, b],
idx: 2,
state: EscapeOctalState::LastBackslash,
} }
} }
} }
@ -124,6 +164,12 @@ impl EscapedChar {
} }
} }
/// Build an escaped character that renders the single byte `b` as an
/// octal escape, using `EscapeOctal::from_byte`.
fn new_octal(b: u8) -> Self {
    let octal = EscapeOctal::from_byte(b);
    Self {
        state: EscapeState::Octal(octal),
    }
}
fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self { fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self {
use EscapeState::*; use EscapeState::*;
let init_state = match c { let init_state = match c {
@ -148,7 +194,7 @@ impl EscapedChar {
_ => Char(' '), _ => Char(' '),
}, },
':' if dirname => Backslash(':'), ':' if dirname => Backslash(':'),
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c)), _ if c.is_control() => Octal(EscapeOctal::from_char(c)),
_ => Char(c), _ => Char(c),
}; };
Self { state: init_state } Self { state: init_state }
@ -165,11 +211,11 @@ impl EscapedChar {
'\x0B' => Backslash('v'), '\x0B' => Backslash('v'),
'\x0C' => Backslash('f'), '\x0C' => Backslash('f'),
'\r' => Backslash('r'), '\r' => Backslash('r'),
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)),
'\'' => match quotes { '\'' => match quotes {
Quotes::Single => Backslash('\''), Quotes::Single => Backslash('\''),
_ => Char('\''), _ => Char('\''),
}, },
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
_ if SPECIAL_SHELL_CHARS.contains(c) => ForceQuote(c), _ if SPECIAL_SHELL_CHARS.contains(c) => ForceQuote(c),
_ => Char(c), _ => Char(c),
}; };
@ -205,11 +251,18 @@ impl Iterator for EscapedChar {
} }
} }
fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) { /// Check whether `bytes` starts with any byte in `pattern`.
let mut must_quote = false; fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool {
let mut escaped_str = String::with_capacity(name.len()); !bytes.is_empty() && pattern.contains(&bytes[0])
}
for c in name.chars() { fn shell_without_escape(name: &[u8], quotes: Quotes, show_control_chars: bool) -> (Vec<u8>, bool) {
let mut must_quote = false;
let mut escaped_str = Vec::with_capacity(name.len());
let mut utf8_buf = vec![0; 4];
for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = { let escaped = {
let ec = EscapedChar::new_shell(c, false, quotes); let ec = EscapedChar::new_shell(c, false, quotes);
if show_control_chars { if show_control_chars {
@ -220,31 +273,39 @@ fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) ->
}; };
match escaped.state { match escaped.state {
EscapeState::Backslash('\'') => escaped_str.push_str("'\\''"), EscapeState::Backslash('\'') => escaped_str.extend_from_slice(b"'\\''"),
EscapeState::ForceQuote(x) => { EscapeState::ForceQuote(x) => {
must_quote = true; must_quote = true;
escaped_str.push(x); escaped_str.extend_from_slice(x.encode_utf8(&mut utf8_buf).as_bytes());
} }
_ => { _ => {
for char in escaped { for c in escaped {
escaped_str.push(char); escaped_str.extend_from_slice(c.encode_utf8(&mut utf8_buf).as_bytes());
} }
} }
} }
} }
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); if show_control_chars {
escaped_str.extend_from_slice(s.invalid());
} else {
escaped_str.resize(escaped_str.len() + s.invalid().len(), b'?');
}
}
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str, must_quote) (escaped_str, must_quote)
} }
fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) { fn shell_with_escape(name: &[u8], quotes: Quotes) -> (Vec<u8>, bool) {
// We need to keep track of whether we are in a dollar expression // We need to keep track of whether we are in a dollar expression
// because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n' // because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
let mut in_dollar = false; let mut in_dollar = false;
let mut must_quote = false; let mut must_quote = false;
let mut escaped_str = String::with_capacity(name.len()); let mut escaped_str = String::with_capacity(name.len());
for c in name.chars() { for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = EscapedChar::new_shell(c, true, quotes); let escaped = EscapedChar::new_shell(c, true, quotes);
match escaped.state { match escaped.state {
EscapeState::Char(x) => { EscapeState::Char(x) => {
@ -282,25 +343,32 @@ fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) {
} }
} }
} }
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); if !s.invalid().is_empty() {
(escaped_str, must_quote) if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
let escaped_bytes: String = s
.invalid()
.iter()
.flat_map(|b| EscapedChar::new_octal(*b))
.collect();
escaped_str.push_str(&escaped_bytes);
}
}
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str.into(), must_quote)
} }
/// Return a set of characters that implies quoting of the word in /// Return a set of characters that implies quoting of the word in
/// shell-quoting mode. /// shell-quoting mode.
fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] { fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] {
const ESCAPED_CHARS: &[char] = &[ const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r=";
// the ':' colon character only induces quoting in the // the ':' colon character only induces quoting in the
// context of ls displaying a directory name before listing its content. // context of ls displaying a directory name before listing its content.
// (e.g. with the recursive flag -R) // (e.g. with the recursive flag -R)
':',
// Under this line are the control characters that should be
// quoted in shell mode in all cases.
'"', '`', '$', '\\', '^', '\n', '\t', '\r', '=',
];
let start_index = if is_dirname { 0 } else { 1 }; let start_index = if is_dirname { 0 } else { 1 };
&ESCAPED_CHARS[start_index..] &ESCAPED_CHARS[start_index..]
} }
@ -308,41 +376,57 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
/// ///
/// This inner function provides an additional flag `dirname` which /// This inner function provides an additional flag `dirname` which
/// is meant for ls' directory name display. /// is meant for ls' directory name display.
fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String { fn escape_name_inner(name: &[u8], style: &QuotingStyle, dirname: bool) -> Vec<u8> {
match style { match style {
QuotingStyle::Literal { show_control } => { QuotingStyle::Literal { show_control } => {
if *show_control { if *show_control {
name.to_string_lossy().into_owned() name.to_owned()
} else { } else {
name.to_string_lossy() name.utf8_chunks()
.map(|s| {
let valid: String = s
.valid()
.chars() .chars()
.flat_map(|c| EscapedChar::new_literal(c).hide_control()) .flat_map(|c| EscapedChar::new_literal(c).hide_control())
.collect() .collect();
let invalid = "?".repeat(s.invalid().len());
valid + &invalid
})
.collect::<String>()
.into()
} }
} }
QuotingStyle::C { quotes } => { QuotingStyle::C { quotes } => {
let escaped_str: String = name let escaped_str: String = name
.to_string_lossy() .utf8_chunks()
.flat_map(|s| {
let valid = s
.valid()
.chars() .chars()
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)) .flat_map(|c| EscapedChar::new_c(c, *quotes, dirname));
.collect(); let invalid = s.invalid().iter().flat_map(|b| EscapedChar::new_octal(*b));
valid.chain(invalid)
})
.collect::<String>();
match quotes { match quotes {
Quotes::Single => format!("'{escaped_str}'"), Quotes::Single => format!("'{escaped_str}'"),
Quotes::Double => format!("\"{escaped_str}\""), Quotes::Double => format!("\"{escaped_str}\""),
Quotes::None => escaped_str, Quotes::None => escaped_str,
} }
.into()
} }
QuotingStyle::Shell { QuotingStyle::Shell {
escape, escape,
always_quote, always_quote,
show_control, show_control,
} => { } => {
let name = name.to_string_lossy(); let (quotes, must_quote) = if name
.iter()
let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) { .any(|c| shell_escaped_char_set(dirname).contains(c))
{
(Quotes::Single, true) (Quotes::Single, true)
} else if name.contains('\'') { } else if name.contains(&b'\'') {
(Quotes::Double, true) (Quotes::Double, true)
} else if *always_quote { } else if *always_quote {
(Quotes::Single, true) (Quotes::Single, true)
@ -351,30 +435,43 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
}; };
let (escaped_str, contains_quote_chars) = if *escape { let (escaped_str, contains_quote_chars) = if *escape {
shell_with_escape(&name, quotes) shell_with_escape(name, quotes)
} else { } else {
shell_without_escape(&name, quotes, *show_control) shell_without_escape(name, quotes, *show_control)
}; };
match (must_quote | contains_quote_chars, quotes) { if must_quote | contains_quote_chars && quotes != Quotes::None {
(true, Quotes::Single) => format!("'{escaped_str}'"), let mut quoted_str = Vec::<u8>::with_capacity(escaped_str.len() + 2);
(true, Quotes::Double) => format!("\"{escaped_str}\""), let quote = if quotes == Quotes::Single {
_ => escaped_str, b'\''
} else {
b'"'
};
quoted_str.push(quote);
quoted_str.extend(escaped_str);
quoted_str.push(quote);
quoted_str
} else {
escaped_str
} }
} }
} }
} }
/// Escape a filename with respect to the given style. /// Escape a filename with respect to the given style.
pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String { pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> OsString {
escape_name_inner(name, style, false) let name = crate::os_str_as_bytes_lossy(name);
crate::os_string_from_vec(escape_name_inner(&name, style, false))
.expect("all byte sequences should be valid for platform, or already replaced in name")
} }
/// Escape a directory name with respect to the given style. /// Escape a directory name with respect to the given style.
/// This is mainly meant to be used for ls' directory name printing and is not /// This is mainly meant to be used for ls' directory name printing and is not
/// likely to be used elsewhere. /// likely to be used elsewhere.
pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String { pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> OsString {
escape_name_inner(dir_name, style, true) let name = crate::os_str_as_bytes_lossy(dir_name);
crate::os_string_from_vec(escape_name_inner(&name, style, true))
.expect("all byte sequences should be valid for platform, or already replaced in name")
} }
impl fmt::Display for QuotingStyle { impl fmt::Display for QuotingStyle {
@ -415,7 +512,7 @@ impl fmt::Display for Quotes {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::quoting_style::{escape_name, Quotes, QuotingStyle}; use crate::quoting_style::{escape_name_inner, Quotes, QuotingStyle};
// spell-checker:ignore (tests/words) one\'two one'two // spell-checker:ignore (tests/words) one\'two one'two
@ -465,14 +562,31 @@ mod tests {
} }
} }
/// Escape `name` once per entry in `map`, using the style named by the
/// second element of each pair, and return the escaped byte strings in
/// the same order. The first element of each pair is ignored here.
fn check_names_inner<T>(name: &[u8], map: &[(T, &str)]) -> Vec<Vec<u8>> {
    let mut escaped = Vec::with_capacity(map.len());
    for (_, style) in map {
        escaped.push(escape_name_inner(name, &get_style(style), false));
    }
    escaped
}
fn check_names(name: &str, map: &[(&str, &str)]) { fn check_names(name: &str, map: &[(&str, &str)]) {
assert_eq!( assert_eq!(
map.iter() map.iter()
.map(|(_, style)| escape_name(name.as_ref(), &get_style(style))) .map(|(correct, _)| *correct)
.collect::<Vec<String>>(), .collect::<Vec<&str>>(),
check_names_inner(name.as_bytes(), map)
.iter()
.map(|bytes| std::str::from_utf8(bytes)
.expect("valid str goes in, valid str comes out"))
.collect::<Vec<&str>>()
);
}
fn check_names_raw(name: &[u8], map: &[(&[u8], &str)]) {
assert_eq!(
map.iter() map.iter()
.map(|(correct, _)| correct.to_string()) .map(|(correct, _)| *correct)
.collect::<Vec<String>>() .collect::<Vec<&[u8]>>(),
check_names_inner(name, map)
); );
} }
@ -487,10 +601,10 @@ mod tests {
("\"one_two\"", "c"), ("\"one_two\"", "c"),
("one_two", "shell"), ("one_two", "shell"),
("one_two", "shell-show"), ("one_two", "shell-show"),
("\'one_two\'", "shell-always"), ("'one_two'", "shell-always"),
("\'one_two\'", "shell-always-show"), ("'one_two'", "shell-always-show"),
("one_two", "shell-escape"), ("one_two", "shell-escape"),
("\'one_two\'", "shell-escape-always"), ("'one_two'", "shell-escape-always"),
], ],
); );
} }
@ -504,12 +618,12 @@ mod tests {
("one two", "literal-show"), ("one two", "literal-show"),
("one\\ two", "escape"), ("one\\ two", "escape"),
("\"one two\"", "c"), ("\"one two\"", "c"),
("\'one two\'", "shell"), ("'one two'", "shell"),
("\'one two\'", "shell-show"), ("'one two'", "shell-show"),
("\'one two\'", "shell-always"), ("'one two'", "shell-always"),
("\'one two\'", "shell-always-show"), ("'one two'", "shell-always-show"),
("\'one two\'", "shell-escape"), ("'one two'", "shell-escape"),
("\'one two\'", "shell-escape-always"), ("'one two'", "shell-escape-always"),
], ],
); );
@ -551,7 +665,7 @@ mod tests {
// One single quote // One single quote
check_names( check_names(
"one\'two", "one'two",
&[ &[
("one'two", "literal"), ("one'two", "literal"),
("one'two", "literal-show"), ("one'two", "literal-show"),
@ -637,7 +751,7 @@ mod tests {
], ],
); );
// The first 16 control characters. NUL is also included, even though it is of // The first 16 ASCII control characters. NUL is also included, even though it is of
// no importance for file names. // no importance for file names.
check_names( check_names(
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
@ -676,7 +790,7 @@ mod tests {
], ],
); );
// The last 16 control characters. // The last 16 ASCII control characters.
check_names( check_names(
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F", "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
&[ &[
@ -730,6 +844,265 @@ mod tests {
("''$'\\177'", "shell-escape-always"), ("''$'\\177'", "shell-escape-always"),
], ],
); );
// The first 16 Unicode control characters.
let test_str = std::str::from_utf8(b"\xC2\x80\xC2\x81\xC2\x82\xC2\x83\xC2\x84\xC2\x85\xC2\x86\xC2\x87\xC2\x88\xC2\x89\xC2\x8A\xC2\x8B\xC2\x8C\xC2\x8D\xC2\x8E\xC2\x8F").unwrap();
check_names(
test_str,
&[
("????????????????", "literal"),
(test_str, "literal-show"),
("\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217", "escape"),
("\"\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217\"", "c"),
("????????????????", "shell"),
(test_str, "shell-show"),
("'????????????????'", "shell-always"),
(&format!("'{}'", test_str), "shell-always-show"),
("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape"),
("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape-always"),
],
);
// The last 16 Unicode control characters.
let test_str = std::str::from_utf8(b"\xC2\x90\xC2\x91\xC2\x92\xC2\x93\xC2\x94\xC2\x95\xC2\x96\xC2\x97\xC2\x98\xC2\x99\xC2\x9A\xC2\x9B\xC2\x9C\xC2\x9D\xC2\x9E\xC2\x9F").unwrap();
check_names(
test_str,
&[
("????????????????", "literal"),
(test_str, "literal-show"),
("\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237", "escape"),
("\"\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237\"", "c"),
("????????????????", "shell"),
(test_str, "shell-show"),
("'????????????????'", "shell-always"),
(&format!("'{}'", test_str), "shell-always-show"),
("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape"),
("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape-always"),
],
);
}
/// Exercises the quoting styles on byte strings that are not valid UTF-8.
///
/// Each call passes raw input bytes plus a table of `(expected bytes, style name)`
/// pairs, one entry per style name.
/// NOTE(review): `check_names_raw` is defined outside this view; presumably it
/// quotes the input under each named style and compares the result against the
/// expected bytes — confirm against its definition.
///
/// Pattern visible in the expectations below:
/// * `literal`, `shell`, `shell-always`: each invalid byte is expected as `?`.
/// * the `-show` variants: the raw bytes are expected unchanged.
/// * `escape`, `c`: each invalid byte is expected as a three-digit octal escape.
/// * `shell-escape`, `shell-escape-always`: invalid bytes are expected as octal
///   escapes inside `$'...'` segments, with neighbouring text in `'...'` segments.
#[test]
fn test_non_unicode_bytes() {
// Building blocks for the inputs below.
// Plain ASCII underscore; valid UTF-8 on its own.
let ascii = b'_';
// 0xA7 has the bit pattern 10xxxxxx: a UTF-8 continuation byte.
let continuation = b'\xA7';
// 0xC2 (110xxxxx): lead byte of a two-byte UTF-8 sequence.
let first2byte = b'\xC2';
// 0xE0 (1110xxxx): lead byte of a three-byte UTF-8 sequence.
let first3byte = b'\xE0';
// 0xF0 (11110xxx): lead byte of a four-byte UTF-8 sequence.
let first4byte = b'\xF0';
// 0xC0 can never appear in well-formed UTF-8 (it could only start an overlong encoding).
let invalid = b'\xC0';
// a single byte value invalid outside of additional context in UTF-8
check_names_raw(
&[continuation],
&[
(b"?", "literal"),
(b"\xA7", "literal-show"),
(b"\\247", "escape"),
(b"\"\\247\"", "c"),
(b"?", "shell"),
(b"\xA7", "shell-show"),
(b"'?'", "shell-always"),
(b"'\xA7'", "shell-always-show"),
(b"''$'\\247'", "shell-escape"),
(b"''$'\\247'", "shell-escape-always"),
],
);
// ...but the byte becomes valid with appropriate context
// (this is just the § character in UTF-8, written as bytes)
// Every style is expected to keep the two bytes as-is; only the surrounding
// quotes differ between styles.
check_names_raw(
&[first2byte, continuation],
&[
(b"\xC2\xA7", "literal"),
(b"\xC2\xA7", "literal-show"),
(b"\xC2\xA7", "escape"),
(b"\"\xC2\xA7\"", "c"),
(b"\xC2\xA7", "shell"),
(b"\xC2\xA7", "shell-show"),
(b"'\xC2\xA7'", "shell-always"),
(b"'\xC2\xA7'", "shell-always-show"),
(b"\xC2\xA7", "shell-escape"),
(b"'\xC2\xA7'", "shell-escape-always"),
],
);
// mixed with valid characters
// Invalid byte first, followed by a valid ASCII byte.
check_names_raw(
&[continuation, ascii],
&[
(b"?_", "literal"),
(b"\xA7_", "literal-show"),
(b"\\247_", "escape"),
(b"\"\\247_\"", "c"),
(b"?_", "shell"),
(b"\xA7_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xA7_'", "shell-always-show"),
(b"''$'\\247''_'", "shell-escape"),
(b"''$'\\247''_'", "shell-escape-always"),
],
);
// Valid ASCII byte first, followed by the invalid byte.
check_names_raw(
&[ascii, continuation],
&[
(b"_?", "literal"),
(b"_\xA7", "literal-show"),
(b"_\\247", "escape"),
(b"\"_\\247\"", "c"),
(b"_?", "shell"),
(b"_\xA7", "shell-show"),
(b"'_?'", "shell-always"),
(b"'_\xA7'", "shell-always-show"),
(b"'_'$'\\247'", "shell-escape"),
(b"'_'$'\\247'", "shell-escape-always"),
],
);
// Invalid byte surrounded by valid ASCII bytes on both sides.
check_names_raw(
&[ascii, continuation, ascii],
&[
(b"_?_", "literal"),
(b"_\xA7_", "literal-show"),
(b"_\\247_", "escape"),
(b"\"_\\247_\"", "c"),
(b"_?_", "shell"),
(b"_\xA7_", "shell-show"),
(b"'_?_'", "shell-always"),
(b"'_\xA7_'", "shell-always-show"),
(b"'_'$'\\247''_'", "shell-escape"),
(b"'_'$'\\247''_'", "shell-escape-always"),
],
);
// Valid ASCII byte surrounded by invalid bytes on both sides.
check_names_raw(
&[continuation, ascii, continuation],
&[
(b"?_?", "literal"),
(b"\xA7_\xA7", "literal-show"),
(b"\\247_\\247", "escape"),
(b"\"\\247_\\247\"", "c"),
(b"?_?", "shell"),
(b"\xA7_\xA7", "shell-show"),
(b"'?_?'", "shell-always"),
(b"'\xA7_\xA7'", "shell-always-show"),
(b"''$'\\247''_'$'\\247'", "shell-escape"),
(b"''$'\\247''_'$'\\247'", "shell-escape-always"),
],
);
// contiguous invalid bytes
// Runs of 1 (0xC0), 2, 3 and 4 invalid bytes, separated by underscores.
// The expectations use one `?` / one octal escape per byte, and the
// shell-escape forms keep each run inside a single `$'...'` segment.
check_names_raw(
&[
ascii,
invalid,
ascii,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
continuation,
ascii,
],
&[
(b"_?_??_???_????_", "literal"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"literal-show",
),
(
b"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_",
"escape",
),
(
b"\"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_\"",
"c",
),
(b"_?_??_???_????_", "shell"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"shell-show",
),
(b"'_?_??_???_????_'", "shell-always"),
(
b"'_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_'",
"shell-always-show",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape-always",
),
],
);
// invalid multi-byte sequences that start valid
// Two-byte lead followed by ASCII instead of a continuation byte.
check_names_raw(
&[first2byte, ascii],
&[
(b"?_", "literal"),
(b"\xC2_", "literal-show"),
(b"\\302_", "escape"),
(b"\"\\302_\"", "c"),
(b"?_", "shell"),
(b"\xC2_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xC2_'", "shell-always-show"),
(b"''$'\\302''_'", "shell-escape"),
(b"''$'\\302''_'", "shell-escape-always"),
],
);
// Stray two-byte lead immediately followed by a complete, valid § sequence:
// only the first 0xC2 is expected to be replaced/escaped, the § is kept.
check_names_raw(
&[first2byte, first2byte, continuation],
&[
(b"?\xC2\xA7", "literal"),
(b"\xC2\xC2\xA7", "literal-show"),
(b"\\302\xC2\xA7", "escape"),
(b"\"\\302\xC2\xA7\"", "c"),
(b"?\xC2\xA7", "shell"),
(b"\xC2\xC2\xA7", "shell-show"),
(b"'?\xC2\xA7'", "shell-always"),
(b"'\xC2\xC2\xA7'", "shell-always-show"),
(b"''$'\\302''\xC2\xA7'", "shell-escape"),
(b"''$'\\302''\xC2\xA7'", "shell-escape-always"),
],
);
// Three-byte lead with only one continuation byte before ASCII:
// both bytes of the truncated sequence are expected to be treated as invalid.
check_names_raw(
&[first3byte, continuation, ascii],
&[
(b"??_", "literal"),
(b"\xE0\xA7_", "literal-show"),
(b"\\340\\247_", "escape"),
(b"\"\\340\\247_\"", "c"),
(b"??_", "shell"),
(b"\xE0\xA7_", "shell-show"),
(b"'??_'", "shell-always"),
(b"'\xE0\xA7_'", "shell-always-show"),
(b"''$'\\340\\247''_'", "shell-escape"),
(b"''$'\\340\\247''_'", "shell-escape-always"),
],
);
// Four-byte lead with only two continuation bytes before ASCII:
// all three bytes of the truncated sequence are expected to be treated as invalid.
check_names_raw(
&[first4byte, continuation, continuation, ascii],
&[
(b"???_", "literal"),
(b"\xF0\xA7\xA7_", "literal-show"),
(b"\\360\\247\\247_", "escape"),
(b"\"\\360\\247\\247_\"", "c"),
(b"???_", "shell"),
(b"\xF0\xA7\xA7_", "shell-show"),
(b"'???_'", "shell-always"),
(b"'\xF0\xA7\xA7_'", "shell-always-show"),
(b"''$'\\360\\247\\247''_'", "shell-escape"),
(b"''$'\\360\\247\\247''_'", "shell-escape-always"),
],
);
} }
#[test] #[test]
@ -765,7 +1138,7 @@ mod tests {
("one\\\\two", "escape"), ("one\\\\two", "escape"),
("\"one\\\\two\"", "c"), ("\"one\\\\two\"", "c"),
("'one\\two'", "shell"), ("'one\\two'", "shell"),
("\'one\\two\'", "shell-always"), ("'one\\two'", "shell-always"),
("'one\\two'", "shell-escape"), ("'one\\two'", "shell-escape"),
("'one\\two'", "shell-escape-always"), ("'one\\two'", "shell-escape-always"),
], ],

View file

@ -255,9 +255,10 @@ pub fn read_yes() -> bool {
} }
} }
/// Helper function for processing delimiter values (which could be non UTF-8) /// Converts an `OsStr` to a UTF-8 `&[u8]`.
/// It converts OsString to &[u8] for unix targets only ///
/// On non-unix (i.e. Windows) it will just return an error if delimiter value is not UTF-8 /// This always succeeds on unix platforms,
/// and fails on other platforms if the string can't be coerced to UTF-8.
pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> {
#[cfg(unix)] #[cfg(unix)]
let bytes = os_string.as_bytes(); let bytes = os_string.as_bytes();
@ -273,13 +274,28 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> {
Ok(bytes) Ok(bytes)
} }
/// Helper function for converting a slice of bytes into an &OsStr /// Performs a potentially lossy conversion from `OsStr` to UTF-8 bytes.
/// or OsString in non-unix targets.
/// ///
/// It converts `&[u8]` to `Cow<OsStr>` for unix targets only. /// This is always lossless on unix platforms,
/// On non-unix (i.e. Windows), the conversion goes through the String type /// and wraps [`OsStr::to_string_lossy`] on non-unix platforms.
/// and thus undergo UTF-8 validation, making it fail if the stream contains pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> {
/// non-UTF-8 characters. #[cfg(unix)]
let bytes = Cow::from(os_string.as_bytes());
#[cfg(not(unix))]
let bytes = match os_string.to_string_lossy() {
Cow::Borrowed(slice) => Cow::from(slice.as_bytes()),
Cow::Owned(owned) => Cow::from(owned.into_bytes()),
};
bytes
}
/// Converts a `&[u8]` to an `&OsStr`,
/// or parses it as UTF-8 into an [`OsString`] on non-unix platforms.
///
/// This always succeeds on unix platforms,
/// and fails on other platforms if the bytes can't be parsed as UTF-8.
pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult<Cow<'_, OsStr>> { pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult<Cow<'_, OsStr>> {
#[cfg(unix)] #[cfg(unix)]
let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); let os_str = Cow::Borrowed(OsStr::from_bytes(bytes));
@ -291,9 +307,10 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult<Cow<'_, OsStr>> {
Ok(os_str) Ok(os_str)
} }
/// Helper function for making an `OsString` from a byte field /// Converts a `Vec<u8>` into an `OsString`, parsing as UTF-8 on non-unix platforms.
/// It converts `Vec<u8>` to `OsString` for unix targets only. ///
/// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8 /// This always succeeds on unix platforms,
/// and fails on other platforms if the bytes can't be parsed as UTF-8.
pub fn os_string_from_vec(vec: Vec<u8>) -> mods::error::UResult<OsString> { pub fn os_string_from_vec(vec: Vec<u8>) -> mods::error::UResult<OsString> {
#[cfg(unix)] #[cfg(unix)]
let s = OsString::from_vec(vec); let s = OsString::from_vec(vec);