1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

Merge pull request #6882 from jtracey/quoting_style_bytes

quoting_style: Add support for non-UTF-8 bytes
This commit is contained in:
Sylvestre Ledru 2024-12-21 23:17:43 +01:00 committed by GitHub
commit bb2fb66073
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 584 additions and 175 deletions

View file

@ -1,4 +1,4 @@
msrv = "1.77.0"
msrv = "1.79.0"
cognitive-complexity-threshold = 24
missing-docs-in-crate-items = true
check-private-items = true

View file

@ -11,7 +11,7 @@ env:
PROJECT_NAME: coreutils
PROJECT_DESC: "Core universal (cross-platform) utilities"
PROJECT_AUTH: "uutils"
RUST_MIN_SRV: "1.77.0"
RUST_MIN_SRV: "1.79.0"
# * style job configuration
STYLE_FAIL_ON_FAULT: true ## (bool) fail the build if a style job contains a fault (error or warning); may be overridden on a per-job basis

View file

@ -16,7 +16,7 @@ repository = "https://github.com/uutils/coreutils"
readme = "README.md"
keywords = ["coreutils", "uutils", "cross-platform", "cli", "utility"]
categories = ["command-line-utilities"]
rust-version = "1.77.0"
rust-version = "1.79.0"
edition = "2021"
build = "build.rs"

View file

@ -14,7 +14,7 @@
[![dependency status](https://deps.rs/repo/github/uutils/coreutils/status.svg)](https://deps.rs/repo/github/uutils/coreutils)
[![CodeCov](https://codecov.io/gh/uutils/coreutils/branch/master/graph/badge.svg)](https://codecov.io/gh/uutils/coreutils)
![MSRV](https://img.shields.io/badge/MSRV-1.77.0-brightgreen)
![MSRV](https://img.shields.io/badge/MSRV-1.79.0-brightgreen)
</div>
@ -70,7 +70,7 @@ the [coreutils docs](https://github.com/uutils/uutils.github.io) repository.
### Rust Version
uutils follows Rust's release channels and is tested against stable, beta and
nightly. The current Minimum Supported Rust Version (MSRV) is `1.77.0`.
nightly. The current Minimum Supported Rust Version (MSRV) is `1.79.0`.
## Building

View file

@ -21,7 +21,7 @@ use std::os::windows::fs::MetadataExt;
use std::{
cmp::Reverse,
error::Error,
ffi::OsString,
ffi::{OsStr, OsString},
fmt::{Display, Write as FmtWrite},
fs::{self, DirEntry, FileType, Metadata, ReadDir},
io::{stdout, BufWriter, ErrorKind, Stdout, Write},
@ -55,7 +55,7 @@ use uucore::libc::{dev_t, major, minor};
#[cfg(unix)]
use uucore::libc::{S_IXGRP, S_IXOTH, S_IXUSR};
use uucore::line_ending::LineEnding;
use uucore::quoting_style::{escape_dir_name, escape_name, QuotingStyle};
use uucore::quoting_style::{self, QuotingStyle};
use uucore::{
display::Quotable,
error::{set_exit_code, UError, UResult},
@ -2048,7 +2048,11 @@ impl PathData {
/// file11
/// ```
fn show_dir_name(path_data: &PathData, out: &mut BufWriter<Stdout>, config: &Config) {
let escaped_name = escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style);
// FIXME: replace this with appropriate behavior for literal unprintable bytes
let escaped_name =
quoting_style::escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style)
.to_string_lossy()
.to_string();
let name = if config.hyperlink && !config.dired {
create_hyperlink(&escaped_name, path_data)
@ -3002,7 +3006,6 @@ use std::sync::Mutex;
#[cfg(unix)]
use uucore::entries;
use uucore::fs::FileInformation;
use uucore::quoting_style;
#[cfg(unix)]
fn cached_uid2usr(uid: u32) -> String {
@ -3542,3 +3545,10 @@ fn calculate_padding_collection(
padding_collections
}
// FIXME: replace this with appropriate behavior for literal unprintable bytes
/// Escapes `name` according to `style` and returns the result as a `String`.
///
/// The escaping itself is delegated to `quoting_style::escape_name`; its
/// output is then converted lossily, so anything that is not valid Unicode
/// does not survive verbatim.
fn escape_name(name: &OsStr, style: &QuotingStyle) -> String {
    let escaped = quoting_style::escape_name(name, style);
    escaped.to_string_lossy().into_owned()
}

View file

@ -13,7 +13,7 @@ mod word_count;
use std::{
borrow::{Borrow, Cow},
cmp::max,
ffi::OsString,
ffi::{OsStr, OsString},
fs::{self, File},
io::{self, Write},
iter,
@ -28,7 +28,7 @@ use utf8::{BufReadDecoder, BufReadDecoderError};
use uucore::{
error::{FromIo, UError, UResult},
format_usage, help_about, help_usage,
quoting_style::{escape_name, QuotingStyle},
quoting_style::{self, QuotingStyle},
shortcut_value_parser::ShortcutValueParser,
show,
};
@ -259,7 +259,7 @@ impl<'a> Input<'a> {
match self {
Self::Path(path) => Some(match path.to_str() {
Some(s) if !s.contains('\n') => Cow::Borrowed(s),
_ => Cow::Owned(escape_name(path.as_os_str(), QS_ESCAPE)),
_ => Cow::Owned(escape_name_wrapper(path.as_os_str())),
}),
Self::Stdin(StdinKind::Explicit) => Some(Cow::Borrowed(STDIN_REPR)),
Self::Stdin(StdinKind::Implicit) => None,
@ -269,7 +269,7 @@ impl<'a> Input<'a> {
/// Converts input into the form that appears in errors.
fn path_display(&self) -> String {
match self {
Self::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE),
Self::Path(path) => escape_name_wrapper(path.as_os_str()),
Self::Stdin(_) => String::from("standard input"),
}
}
@ -361,7 +361,7 @@ impl WcError {
Some((input, idx)) => {
let path = match input {
Input::Stdin(_) => STDIN_REPR.into(),
Input::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE).into(),
Input::Path(path) => escape_name_wrapper(path.as_os_str()).into(),
};
Self::ZeroLengthFileNameCtx { path, idx }
}
@ -761,7 +761,9 @@ fn files0_iter_file<'a>(path: &Path) -> UResult<impl Iterator<Item = InputIterIt
Err(e) => Err(e.map_err_context(|| {
format!(
"cannot open {} for reading",
escape_name(path.as_os_str(), QS_QUOTE_ESCAPE)
quoting_style::escape_name(path.as_os_str(), QS_QUOTE_ESCAPE)
.into_string()
.expect("All escaped names with the escaping option return valid strings.")
)
})),
}
@ -793,9 +795,9 @@ fn files0_iter<'a>(
Ok(Input::Path(PathBuf::from(s).into()))
}
}
Err(e) => Err(e.map_err_context(|| {
format!("{}: read error", escape_name(&err_path, QS_ESCAPE))
}) as Box<dyn UError>),
Err(e) => Err(e
.map_err_context(|| format!("{}: read error", escape_name_wrapper(&err_path)))
as Box<dyn UError>),
}),
);
// Loop until there is an error; yield that error and then nothing else.
@ -808,6 +810,12 @@ fn files0_iter<'a>(
})
}
/// Escapes `name` with the `QS_ESCAPE` quoting style and returns it as a `String`.
///
/// # Panics
///
/// Panics if the escaped name cannot be converted into a `String`.
fn escape_name_wrapper(name: &OsStr) -> String {
    let escaped = quoting_style::escape_name(name, QS_ESCAPE);
    escaped
        .into_string()
        .expect("All escaped names with the escaping option return valid strings.")
}
fn wc(inputs: &Inputs, settings: &Settings) -> UResult<()> {
let mut total_word_count = WordCount::default();
let mut num_inputs: usize = 0;

View file

@ -112,7 +112,8 @@ fn extract_value<T: Default>(p: Result<T, ParseError<'_, T>>, input: &str) -> T
Default::default()
}
ParseError::PartialMatch(v, rest) => {
if input.starts_with('\'') {
let bytes = input.as_encoded_bytes();
if !bytes.is_empty() && bytes[0] == b'\'' {
show_warning!(
"{}: character(s) following character constant have been ignored",
&rest,

View file

@ -353,20 +353,20 @@ impl Spec {
writer.write_all(&parsed).map_err(FormatError::IoError)
}
Self::QuotedString => {
let s = args.get_str();
writer
.write_all(
escape_name(
s.as_ref(),
&QuotingStyle::Shell {
escape: true,
always_quote: false,
show_control: false,
},
)
.as_bytes(),
)
.map_err(FormatError::IoError)
let s = escape_name(
args.get_str().as_ref(),
&QuotingStyle::Shell {
escape: true,
always_quote: false,
show_control: false,
},
);
#[cfg(unix)]
let bytes = std::os::unix::ffi::OsStringExt::into_vec(s);
#[cfg(not(unix))]
let bytes = s.to_string_lossy().as_bytes().to_owned();
writer.write_all(&bytes).map_err(FormatError::IoError)
}
Self::SignedInt {
width,

View file

@ -6,39 +6,43 @@
//! Set of functions for escaping names according to different quoting styles.
use std::char::from_digit;
use std::ffi::OsStr;
use std::ffi::{OsStr, OsString};
use std::fmt;
// These are characters with special meaning in the shell (e.g. bash).
// The first const contains characters that only have a special meaning when they appear at the beginning of a name.
const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#'];
const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#";
// PR#6559 : Remove `]{}` from special shell chars.
const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! ";
/// The quoting style to use when escaping a name.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum QuotingStyle {
/// Escape the name as a shell string.
/// Used in, e.g., `ls --quoting-style=shell`.
Shell {
/// Whether to escape characters in the name.
/// True in, e.g., `ls --quoting-style=shell-escape`.
escape: bool,
/// Whether to always quote the name.
always_quote: bool,
/// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool,
},
/// Escape the name as a C string.
/// Used in, e.g., `ls --quote-name`.
C {
/// The type of quotes to use.
quotes: Quotes,
},
/// Do not escape the string.
/// Used in, e.g., `ls --literal`.
Literal {
/// Whether to show control and non-unicode characters, or replace them with `?`.
show_control: bool,
},
}
@ -72,16 +76,24 @@ enum EscapeState {
Octal(EscapeOctal),
}
/// Bytes we need to present as escaped octal, in the form of `\nnn` per byte.
/// Only supports characters up to 2 bytes long in UTF-8.
struct EscapeOctal {
c: char,
c: [u8; 2],
state: EscapeOctalState,
idx: usize,
idx: u8,
}
enum EscapeOctalState {
Done,
Backslash,
Value,
FirstBackslash,
FirstValue,
LastBackslash,
LastValue,
}
/// Extracts a single octal digit from `byte`.
///
/// `idx` counts digits from the least significant end: `0` selects the lowest
/// three bits, `1` the next three, and `2` the remaining top two bits.
fn byte_to_octal_digit(byte: u8, idx: u8) -> u8 {
    let shift = idx * 3;
    let shifted = byte >> shift;
    shifted & 0b111
}
impl Iterator for EscapeOctal {
@ -90,29 +102,57 @@ impl Iterator for EscapeOctal {
fn next(&mut self) -> Option<char> {
match self.state {
EscapeOctalState::Done => None,
EscapeOctalState::Backslash => {
self.state = EscapeOctalState::Value;
EscapeOctalState::FirstBackslash => {
self.state = EscapeOctalState::FirstValue;
Some('\\')
}
EscapeOctalState::Value => {
let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7;
EscapeOctalState::LastBackslash => {
self.state = EscapeOctalState::LastValue;
Some('\\')
}
EscapeOctalState::FirstValue => {
let octal_digit = byte_to_octal_digit(self.c[0], self.idx);
if self.idx == 0 {
self.state = EscapeOctalState::LastBackslash;
self.idx = 2;
} else {
self.idx -= 1;
}
Some(from_digit(octal_digit.into(), 8).unwrap())
}
EscapeOctalState::LastValue => {
let octal_digit = byte_to_octal_digit(self.c[1], self.idx);
if self.idx == 0 {
self.state = EscapeOctalState::Done;
} else {
self.idx -= 1;
}
Some(from_digit(octal_digit, 8).unwrap())
Some(from_digit(octal_digit.into(), 8).unwrap())
}
}
}
}
impl EscapeOctal {
fn from(c: char) -> Self {
fn from_char(c: char) -> Self {
if c.len_utf8() == 1 {
return Self::from_byte(c as u8);
}
let mut buf = [0; 2];
let _s = c.encode_utf8(&mut buf);
Self {
c,
c: buf,
idx: 2,
state: EscapeOctalState::Backslash,
state: EscapeOctalState::FirstBackslash,
}
}
/// Builds the escape for a single raw byte.
///
/// The byte is placed in the second slot and the state starts at
/// `LastBackslash`, so iteration yields exactly one `\nnn` group.
fn from_byte(b: u8) -> Self {
    let bytes = [0, b];
    Self {
        state: EscapeOctalState::LastBackslash,
        idx: 2,
        c: bytes,
    }
}
}
@ -124,6 +164,12 @@ impl EscapedChar {
}
}
/// Creates an escaped character that renders the byte `b` as an octal escape.
fn new_octal(b: u8) -> Self {
    let octal = EscapeOctal::from_byte(b);
    Self {
        state: EscapeState::Octal(octal),
    }
}
fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self {
use EscapeState::*;
let init_state = match c {
@ -148,7 +194,7 @@ impl EscapedChar {
_ => Char(' '),
},
':' if dirname => Backslash(':'),
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c)),
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
_ => Char(c),
};
Self { state: init_state }
@ -165,11 +211,11 @@ impl EscapedChar {
'\x0B' => Backslash('v'),
'\x0C' => Backslash('f'),
'\r' => Backslash('r'),
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)),
'\'' => match quotes {
Quotes::Single => Backslash('\''),
_ => Char('\''),
},
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
_ if SPECIAL_SHELL_CHARS.contains(c) => ForceQuote(c),
_ => Char(c),
};
@ -205,102 +251,124 @@ impl Iterator for EscapedChar {
}
}
fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) {
/// Check whether the first byte of `bytes` is one of the bytes in `pattern`.
///
/// Returns `false` when `bytes` is empty.
fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool {
    match bytes.first() {
        Some(lead) => pattern.contains(lead),
        None => false,
    }
}
fn shell_without_escape(name: &[u8], quotes: Quotes, show_control_chars: bool) -> (Vec<u8>, bool) {
let mut must_quote = false;
let mut escaped_str = String::with_capacity(name.len());
let mut escaped_str = Vec::with_capacity(name.len());
let mut utf8_buf = vec![0; 4];
for c in name.chars() {
let escaped = {
let ec = EscapedChar::new_shell(c, false, quotes);
if show_control_chars {
ec
} else {
ec.hide_control()
}
};
for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = {
let ec = EscapedChar::new_shell(c, false, quotes);
if show_control_chars {
ec
} else {
ec.hide_control()
}
};
match escaped.state {
EscapeState::Backslash('\'') => escaped_str.push_str("'\\''"),
EscapeState::ForceQuote(x) => {
must_quote = true;
escaped_str.push(x);
}
_ => {
for char in escaped {
escaped_str.push(char);
match escaped.state {
EscapeState::Backslash('\'') => escaped_str.extend_from_slice(b"'\\''"),
EscapeState::ForceQuote(x) => {
must_quote = true;
escaped_str.extend_from_slice(x.encode_utf8(&mut utf8_buf).as_bytes());
}
_ => {
for c in escaped {
escaped_str.extend_from_slice(c.encode_utf8(&mut utf8_buf).as_bytes());
}
}
}
}
if show_control_chars {
escaped_str.extend_from_slice(s.invalid());
} else {
escaped_str.resize(escaped_str.len() + s.invalid().len(), b'?');
}
}
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START);
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str, must_quote)
}
fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) {
fn shell_with_escape(name: &[u8], quotes: Quotes) -> (Vec<u8>, bool) {
// We need to keep track of whether we are in a dollar expression
// because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
let mut in_dollar = false;
let mut must_quote = false;
let mut escaped_str = String::with_capacity(name.len());
for c in name.chars() {
let escaped = EscapedChar::new_shell(c, true, quotes);
match escaped.state {
EscapeState::Char(x) => {
if in_dollar {
escaped_str.push_str("''");
for s in name.utf8_chunks() {
for c in s.valid().chars() {
let escaped = EscapedChar::new_shell(c, true, quotes);
match escaped.state {
EscapeState::Char(x) => {
if in_dollar {
escaped_str.push_str("''");
in_dollar = false;
}
escaped_str.push(x);
}
EscapeState::ForceQuote(x) => {
if in_dollar {
escaped_str.push_str("''");
in_dollar = false;
}
must_quote = true;
escaped_str.push(x);
}
// Single quotes are not put in dollar expressions, but are escaped
// if the string also contains double quotes. In that case, they must
// be handled separately.
EscapeState::Backslash('\'') => {
must_quote = true;
in_dollar = false;
escaped_str.push_str("'\\''");
}
escaped_str.push(x);
}
EscapeState::ForceQuote(x) => {
if in_dollar {
escaped_str.push_str("''");
in_dollar = false;
}
must_quote = true;
escaped_str.push(x);
}
// Single quotes are not put in dollar expressions, but are escaped
// if the string also contains double quotes. In that case, they must
// be handled separately.
EscapeState::Backslash('\'') => {
must_quote = true;
in_dollar = false;
escaped_str.push_str("'\\''");
}
_ => {
if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
for char in escaped {
escaped_str.push(char);
_ => {
if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
for char in escaped {
escaped_str.push(char);
}
}
}
}
if !s.invalid().is_empty() {
if !in_dollar {
escaped_str.push_str("'$'");
in_dollar = true;
}
must_quote = true;
let escaped_bytes: String = s
.invalid()
.iter()
.flat_map(|b| EscapedChar::new_octal(*b))
.collect();
escaped_str.push_str(&escaped_bytes);
}
}
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START);
(escaped_str, must_quote)
must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START);
(escaped_str.into(), must_quote)
}
/// Return a set of characters that implies quoting of the word in
/// shell-quoting mode.
fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
const ESCAPED_CHARS: &[char] = &[
// the ':' colon character only induce quoting in the
// context of ls displaying a directory name before listing its content.
// (e.g. with the recursive flag -R)
':',
// Under this line are the control characters that should be
// quoted in shell mode in all cases.
'"', '`', '$', '\\', '^', '\n', '\t', '\r', '=',
];
fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] {
    // The leading ':' colon character only induces quoting in the
    // context of ls displaying a directory name before listing its content
    // (e.g. with the recursive flag -R). It is kept first so that it can be
    // sliced off for ordinary names.
    const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r=";
    if is_dirname {
        ESCAPED_CHARS
    } else {
        &ESCAPED_CHARS[1..]
    }
}
@ -308,41 +376,57 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
///
/// This inner function provides an additional flag `dirname` which
/// is meant for ls' directory name display.
fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String {
fn escape_name_inner(name: &[u8], style: &QuotingStyle, dirname: bool) -> Vec<u8> {
match style {
QuotingStyle::Literal { show_control } => {
if *show_control {
name.to_string_lossy().into_owned()
name.to_owned()
} else {
name.to_string_lossy()
.chars()
.flat_map(|c| EscapedChar::new_literal(c).hide_control())
.collect()
name.utf8_chunks()
.map(|s| {
let valid: String = s
.valid()
.chars()
.flat_map(|c| EscapedChar::new_literal(c).hide_control())
.collect();
let invalid = "?".repeat(s.invalid().len());
valid + &invalid
})
.collect::<String>()
.into()
}
}
QuotingStyle::C { quotes } => {
let escaped_str: String = name
.to_string_lossy()
.chars()
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname))
.collect();
.utf8_chunks()
.flat_map(|s| {
let valid = s
.valid()
.chars()
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname));
let invalid = s.invalid().iter().flat_map(|b| EscapedChar::new_octal(*b));
valid.chain(invalid)
})
.collect::<String>();
match quotes {
Quotes::Single => format!("'{escaped_str}'"),
Quotes::Double => format!("\"{escaped_str}\""),
Quotes::None => escaped_str,
}
.into()
}
QuotingStyle::Shell {
escape,
always_quote,
show_control,
} => {
let name = name.to_string_lossy();
let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) {
let (quotes, must_quote) = if name
.iter()
.any(|c| shell_escaped_char_set(dirname).contains(c))
{
(Quotes::Single, true)
} else if name.contains('\'') {
} else if name.contains(&b'\'') {
(Quotes::Double, true)
} else if *always_quote {
(Quotes::Single, true)
@ -351,30 +435,43 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
};
let (escaped_str, contains_quote_chars) = if *escape {
shell_with_escape(&name, quotes)
shell_with_escape(name, quotes)
} else {
shell_without_escape(&name, quotes, *show_control)
shell_without_escape(name, quotes, *show_control)
};
match (must_quote | contains_quote_chars, quotes) {
(true, Quotes::Single) => format!("'{escaped_str}'"),
(true, Quotes::Double) => format!("\"{escaped_str}\""),
_ => escaped_str,
if must_quote | contains_quote_chars && quotes != Quotes::None {
let mut quoted_str = Vec::<u8>::with_capacity(escaped_str.len() + 2);
let quote = if quotes == Quotes::Single {
b'\''
} else {
b'"'
};
quoted_str.push(quote);
quoted_str.extend(escaped_str);
quoted_str.push(quote);
quoted_str
} else {
escaped_str
}
}
}
}
/// Escape a filename with respect to the given style.
pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String {
escape_name_inner(name, style, false)
pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> OsString {
    // Quoting operates on raw bytes, so view the name as bytes first and
    // turn the escaped bytes back into an `OsString` afterwards.
    let raw = crate::os_str_as_bytes_lossy(name);
    let escaped = escape_name_inner(&raw, style, false);
    crate::os_string_from_vec(escaped)
        .expect("all byte sequences should be valid for platform, or already replaced in name")
}
/// Escape a directory name with respect to the given style.
/// This is mainly meant to be used for ls' directory name printing and is not
/// likely to be used elsewhere.
pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String {
escape_name_inner(dir_name, style, true)
pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> OsString {
    // Same pipeline as `escape_name`, but with the `dirname` flag set for
    // the inner escaping routine.
    let raw = crate::os_str_as_bytes_lossy(dir_name);
    let escaped = escape_name_inner(&raw, style, true);
    crate::os_string_from_vec(escaped)
        .expect("all byte sequences should be valid for platform, or already replaced in name")
}
impl fmt::Display for QuotingStyle {
@ -415,7 +512,7 @@ impl fmt::Display for Quotes {
#[cfg(test)]
mod tests {
use crate::quoting_style::{escape_name, Quotes, QuotingStyle};
use crate::quoting_style::{escape_name_inner, Quotes, QuotingStyle};
// spell-checker:ignore (tests/words) one\'two one'two
@ -465,14 +562,31 @@ mod tests {
}
}
/// Escapes `name` once per entry of `map`, using the quoting style named by
/// the entry's second element, and returns the escaped outputs in table order.
///
/// The first element of each entry (the expected value) is ignored here.
fn check_names_inner<T>(name: &[u8], map: &[(T, &str)]) -> Vec<Vec<u8>> {
    let mut outputs = Vec::with_capacity(map.len());
    for (_, style_name) in map {
        let style = get_style(style_name);
        outputs.push(escape_name_inner(name, &style, false));
    }
    outputs
}
fn check_names(name: &str, map: &[(&str, &str)]) {
assert_eq!(
map.iter()
.map(|(_, style)| escape_name(name.as_ref(), &get_style(style)))
.collect::<Vec<String>>(),
.map(|(correct, _)| *correct)
.collect::<Vec<&str>>(),
check_names_inner(name.as_bytes(), map)
.iter()
.map(|bytes| std::str::from_utf8(bytes)
.expect("valid str goes in, valid str comes out"))
.collect::<Vec<&str>>()
);
}
fn check_names_raw(name: &[u8], map: &[(&[u8], &str)]) {
assert_eq!(
map.iter()
.map(|(correct, _)| correct.to_string())
.collect::<Vec<String>>()
.map(|(correct, _)| *correct)
.collect::<Vec<&[u8]>>(),
check_names_inner(name, map)
);
}
@ -487,10 +601,10 @@ mod tests {
("\"one_two\"", "c"),
("one_two", "shell"),
("one_two", "shell-show"),
("\'one_two\'", "shell-always"),
("\'one_two\'", "shell-always-show"),
("'one_two'", "shell-always"),
("'one_two'", "shell-always-show"),
("one_two", "shell-escape"),
("\'one_two\'", "shell-escape-always"),
("'one_two'", "shell-escape-always"),
],
);
}
@ -504,12 +618,12 @@ mod tests {
("one two", "literal-show"),
("one\\ two", "escape"),
("\"one two\"", "c"),
("\'one two\'", "shell"),
("\'one two\'", "shell-show"),
("\'one two\'", "shell-always"),
("\'one two\'", "shell-always-show"),
("\'one two\'", "shell-escape"),
("\'one two\'", "shell-escape-always"),
("'one two'", "shell"),
("'one two'", "shell-show"),
("'one two'", "shell-always"),
("'one two'", "shell-always-show"),
("'one two'", "shell-escape"),
("'one two'", "shell-escape-always"),
],
);
@ -551,7 +665,7 @@ mod tests {
// One single quote
check_names(
"one\'two",
"one'two",
&[
("one'two", "literal"),
("one'two", "literal-show"),
@ -637,7 +751,7 @@ mod tests {
],
);
// The first 16 control characters. NUL is also included, even though it is of
// The first 16 ASCII control characters. NUL is also included, even though it is of
// no importance for file names.
check_names(
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
@ -676,7 +790,7 @@ mod tests {
],
);
// The last 16 control characters.
// The last 16 ASCII control characters.
check_names(
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
&[
@ -730,6 +844,265 @@ mod tests {
("''$'\\177'", "shell-escape-always"),
],
);
// The first 16 Unicode control characters.
let test_str = std::str::from_utf8(b"\xC2\x80\xC2\x81\xC2\x82\xC2\x83\xC2\x84\xC2\x85\xC2\x86\xC2\x87\xC2\x88\xC2\x89\xC2\x8A\xC2\x8B\xC2\x8C\xC2\x8D\xC2\x8E\xC2\x8F").unwrap();
check_names(
test_str,
&[
("????????????????", "literal"),
(test_str, "literal-show"),
("\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217", "escape"),
("\"\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217\"", "c"),
("????????????????", "shell"),
(test_str, "shell-show"),
("'????????????????'", "shell-always"),
(&format!("'{}'", test_str), "shell-always-show"),
("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape"),
("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape-always"),
],
);
// The last 16 Unicode control characters.
let test_str = std::str::from_utf8(b"\xC2\x90\xC2\x91\xC2\x92\xC2\x93\xC2\x94\xC2\x95\xC2\x96\xC2\x97\xC2\x98\xC2\x99\xC2\x9A\xC2\x9B\xC2\x9C\xC2\x9D\xC2\x9E\xC2\x9F").unwrap();
check_names(
test_str,
&[
("????????????????", "literal"),
(test_str, "literal-show"),
("\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237", "escape"),
("\"\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237\"", "c"),
("????????????????", "shell"),
(test_str, "shell-show"),
("'????????????????'", "shell-always"),
(&format!("'{}'", test_str), "shell-always-show"),
("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape"),
("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape-always"),
],
);
}
/// Checks every quoting style against inputs that are not valid UTF-8.
///
/// Each table entry is `(expected output, style name)`. In the tables below
/// an invalid byte is expected as `?` in the non-`show` literal/shell styles,
/// as the raw byte in the `-show` styles, and as a `\nnn` octal escape in the
/// `escape`, `c` and `shell-escape*` styles.
#[test]
fn test_non_unicode_bytes() {
// Building blocks for the inputs below.
let ascii = b'_';
// 0xA7 has the UTF-8 continuation-byte bit pattern (0b10xxxxxx).
let continuation = b'\xA7';
// Lead bytes announcing a 2-, 3- and 4-byte sequence respectively.
let first2byte = b'\xC2';
let first3byte = b'\xE0';
let first4byte = b'\xF0';
// 0xC0 never occurs in well-formed UTF-8.
let invalid = b'\xC0';
// a single byte value invalid outside of additional context in UTF-8
check_names_raw(
&[continuation],
&[
(b"?", "literal"),
(b"\xA7", "literal-show"),
(b"\\247", "escape"),
(b"\"\\247\"", "c"),
(b"?", "shell"),
(b"\xA7", "shell-show"),
(b"'?'", "shell-always"),
(b"'\xA7'", "shell-always-show"),
(b"''$'\\247'", "shell-escape"),
(b"''$'\\247'", "shell-escape-always"),
],
);
// ...but the byte becomes valid with appropriate context
// (this is just the § character in UTF-8, written as bytes)
check_names_raw(
&[first2byte, continuation],
&[
(b"\xC2\xA7", "literal"),
(b"\xC2\xA7", "literal-show"),
(b"\xC2\xA7", "escape"),
(b"\"\xC2\xA7\"", "c"),
(b"\xC2\xA7", "shell"),
(b"\xC2\xA7", "shell-show"),
(b"'\xC2\xA7'", "shell-always"),
(b"'\xC2\xA7'", "shell-always-show"),
(b"\xC2\xA7", "shell-escape"),
(b"'\xC2\xA7'", "shell-escape-always"),
],
);
// mixed with valid characters
check_names_raw(
&[continuation, ascii],
&[
(b"?_", "literal"),
(b"\xA7_", "literal-show"),
(b"\\247_", "escape"),
(b"\"\\247_\"", "c"),
(b"?_", "shell"),
(b"\xA7_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xA7_'", "shell-always-show"),
(b"''$'\\247''_'", "shell-escape"),
(b"''$'\\247''_'", "shell-escape-always"),
],
);
// invalid byte after a valid character
check_names_raw(
&[ascii, continuation],
&[
(b"_?", "literal"),
(b"_\xA7", "literal-show"),
(b"_\\247", "escape"),
(b"\"_\\247\"", "c"),
(b"_?", "shell"),
(b"_\xA7", "shell-show"),
(b"'_?'", "shell-always"),
(b"'_\xA7'", "shell-always-show"),
(b"'_'$'\\247'", "shell-escape"),
(b"'_'$'\\247'", "shell-escape-always"),
],
);
// invalid byte surrounded by valid characters
check_names_raw(
&[ascii, continuation, ascii],
&[
(b"_?_", "literal"),
(b"_\xA7_", "literal-show"),
(b"_\\247_", "escape"),
(b"\"_\\247_\"", "c"),
(b"_?_", "shell"),
(b"_\xA7_", "shell-show"),
(b"'_?_'", "shell-always"),
(b"'_\xA7_'", "shell-always-show"),
(b"'_'$'\\247''_'", "shell-escape"),
(b"'_'$'\\247''_'", "shell-escape-always"),
],
);
// valid character surrounded by invalid bytes
check_names_raw(
&[continuation, ascii, continuation],
&[
(b"?_?", "literal"),
(b"\xA7_\xA7", "literal-show"),
(b"\\247_\\247", "escape"),
(b"\"\\247_\\247\"", "c"),
(b"?_?", "shell"),
(b"\xA7_\xA7", "shell-show"),
(b"'?_?'", "shell-always"),
(b"'\xA7_\xA7'", "shell-always-show"),
(b"''$'\\247''_'$'\\247'", "shell-escape"),
(b"''$'\\247''_'$'\\247'", "shell-escape-always"),
],
);
// contiguous invalid bytes
check_names_raw(
&[
ascii,
invalid,
ascii,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
ascii,
continuation,
continuation,
continuation,
continuation,
ascii,
],
&[
(b"_?_??_???_????_", "literal"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"literal-show",
),
(
b"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_",
"escape",
),
(
b"\"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_\"",
"c",
),
(b"_?_??_???_????_", "shell"),
(
b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_",
"shell-show",
),
(b"'_?_??_???_????_'", "shell-always"),
(
b"'_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_'",
"shell-always-show",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape",
),
(
b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'",
"shell-escape-always",
),
],
);
// invalid multi-byte sequences that start valid
check_names_raw(
&[first2byte, ascii],
&[
(b"?_", "literal"),
(b"\xC2_", "literal-show"),
(b"\\302_", "escape"),
(b"\"\\302_\"", "c"),
(b"?_", "shell"),
(b"\xC2_", "shell-show"),
(b"'?_'", "shell-always"),
(b"'\xC2_'", "shell-always-show"),
(b"''$'\\302''_'", "shell-escape"),
(b"''$'\\302''_'", "shell-escape-always"),
],
);
// a stray lead byte directly followed by a complete 2-byte character:
// only the stray byte is replaced or escaped
check_names_raw(
&[first2byte, first2byte, continuation],
&[
(b"?\xC2\xA7", "literal"),
(b"\xC2\xC2\xA7", "literal-show"),
(b"\\302\xC2\xA7", "escape"),
(b"\"\\302\xC2\xA7\"", "c"),
(b"?\xC2\xA7", "shell"),
(b"\xC2\xC2\xA7", "shell-show"),
(b"'?\xC2\xA7'", "shell-always"),
(b"'\xC2\xC2\xA7'", "shell-always-show"),
(b"''$'\\302''\xC2\xA7'", "shell-escape"),
(b"''$'\\302''\xC2\xA7'", "shell-escape-always"),
],
);
// a 3-byte lead with one continuation byte, cut short by ASCII
check_names_raw(
&[first3byte, continuation, ascii],
&[
(b"??_", "literal"),
(b"\xE0\xA7_", "literal-show"),
(b"\\340\\247_", "escape"),
(b"\"\\340\\247_\"", "c"),
(b"??_", "shell"),
(b"\xE0\xA7_", "shell-show"),
(b"'??_'", "shell-always"),
(b"'\xE0\xA7_'", "shell-always-show"),
(b"''$'\\340\\247''_'", "shell-escape"),
(b"''$'\\340\\247''_'", "shell-escape-always"),
],
);
// a 4-byte lead with two continuation bytes, cut short by ASCII
check_names_raw(
&[first4byte, continuation, continuation, ascii],
&[
(b"???_", "literal"),
(b"\xF0\xA7\xA7_", "literal-show"),
(b"\\360\\247\\247_", "escape"),
(b"\"\\360\\247\\247_\"", "c"),
(b"???_", "shell"),
(b"\xF0\xA7\xA7_", "shell-show"),
(b"'???_'", "shell-always"),
(b"'\xF0\xA7\xA7_'", "shell-always-show"),
(b"''$'\\360\\247\\247''_'", "shell-escape"),
(b"''$'\\360\\247\\247''_'", "shell-escape-always"),
],
);
}
#[test]
@ -765,7 +1138,7 @@ mod tests {
("one\\\\two", "escape"),
("\"one\\\\two\"", "c"),
("'one\\two'", "shell"),
("\'one\\two\'", "shell-always"),
("'one\\two'", "shell-always"),
("'one\\two'", "shell-escape"),
("'one\\two'", "shell-escape-always"),
],

View file

@ -255,9 +255,10 @@ pub fn read_yes() -> bool {
}
}
/// Helper function for processing delimiter values (which could be non UTF-8)
/// It converts OsString to &[u8] for unix targets only
/// On non-unix (i.e. Windows) it will just return an error if delimiter value is not UTF-8
/// Converts an `OsStr` to a UTF-8 `&[u8]`.
///
/// This always succeeds on unix platforms,
/// and fails on other platforms if the string can't be coerced to UTF-8.
pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> {
#[cfg(unix)]
let bytes = os_string.as_bytes();
@ -273,13 +274,28 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> {
Ok(bytes)
}
/// Helper function for converting a slice of bytes into an &OsStr
/// or OsString in non-unix targets.
/// Performs a potentially lossy conversion from `OsStr` to UTF-8 bytes.
///
/// It converts `&[u8]` to `Cow<OsStr>` for unix targets only.
/// On non-unix (i.e. Windows), the conversion goes through the String type
/// and thus undergo UTF-8 validation, making it fail if the stream contains
/// non-UTF-8 characters.
/// This is always lossless on unix platforms,
/// and wraps [`OsStr::to_string_lossy`] on non-unix platforms.
pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> {
    // On unix the underlying bytes can be borrowed directly.
    #[cfg(unix)]
    let bytes = Cow::Borrowed(os_string.as_bytes());
    // Elsewhere, go through a lossy UTF-8 conversion and keep the borrow
    // whenever that conversion did not have to allocate.
    #[cfg(not(unix))]
    let bytes = match os_string.to_string_lossy() {
        Cow::Borrowed(text) => Cow::Borrowed(text.as_bytes()),
        Cow::Owned(text) => Cow::Owned(text.into_bytes()),
    };
    bytes
}
/// Converts a `&[u8]` to an `&OsStr`,
/// or parses it as UTF-8 into an [`OsString`] on non-unix platforms.
///
/// This always succeeds on unix platforms,
/// and fails on other platforms if the bytes can't be parsed as UTF-8.
pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult<Cow<'_, OsStr>> {
#[cfg(unix)]
let os_str = Cow::Borrowed(OsStr::from_bytes(bytes));
@ -291,9 +307,10 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult<Cow<'_, OsStr>> {
Ok(os_str)
}
/// Helper function for making an `OsString` from a byte field
/// It converts `Vec<u8>` to `OsString` for unix targets only.
/// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8
/// Converts a `Vec<u8>` into an `OsString`, parsing as UTF-8 on non-unix platforms.
///
/// This always succeeds on unix platforms,
/// and fails on other platforms if the bytes can't be parsed as UTF-8.
pub fn os_string_from_vec(vec: Vec<u8>) -> mods::error::UResult<OsString> {
#[cfg(unix)]
let s = OsString::from_vec(vec);