1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 19:47:45 +00:00

shuf: Use OS strings, don't split individual arguments, cleanup

- shuf now uses OS strings, so it can read from filenames that are
  invalid Unicode and it can shuffle arguments that are invalid
  Unicode. `uucore` now has an `OsWrite` trait to support this without
  platform-specific boilerplate.

- shuf no longer tries to split individual command line arguments,
  only bulk input from a file/stdin. (This matches GNU and busybox.)

- More values are parsed inside clap instead of manually, leading to
  better error messages and less code.

- Some code has been simplified or made more idiomatic.
This commit is contained in:
Jan Verbeek 2024-03-02 12:12:35 +01:00
parent 87ec8285c3
commit f562543b6c
6 changed files with 250 additions and 155 deletions

1
Cargo.lock generated
View file

@ -3184,7 +3184,6 @@ name = "uu_shuf"
version = "0.0.30"
dependencies = [
"clap",
"memchr",
"rand 0.9.0",
"rand_core 0.9.3",
"uucore",

View file

@ -18,7 +18,6 @@ path = "src/shuf.rs"
[dependencies]
clap = { workspace = true }
memchr = { workspace = true }
rand = { workspace = true }
rand_core = { workspace = true }
uucore = { workspace = true }

View file

@ -5,23 +5,27 @@
// spell-checker:ignore (ToDO) cmdline evec nonrepeating seps shufable rvec fdata
use clap::builder::ValueParser;
use clap::{Arg, ArgAction, Command};
use memchr::memchr_iter;
use rand::prelude::{IndexedRandom, SliceRandom};
use rand::prelude::SliceRandom;
use rand::seq::IndexedRandom;
use rand::{Rng, RngCore};
use std::collections::HashSet;
use std::ffi::{OsStr, OsString};
use std::fs::File;
use std::io::{stdin, stdout, BufReader, BufWriter, Error, Read, Write};
use std::io::{stdin, stdout, BufWriter, Error, Read, Write};
use std::ops::RangeInclusive;
use uucore::display::Quotable;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use uucore::display::{OsWrite, Quotable};
use uucore::error::{FromIo, UResult, USimpleError, UUsageError};
use uucore::{format_usage, help_about, help_usage};
mod rand_read_adapter;
enum Mode {
Default(String),
Echo(Vec<String>),
Default(PathBuf),
Echo(Vec<OsString>),
InputRange(RangeInclusive<usize>),
}
@ -30,8 +34,8 @@ static ABOUT: &str = help_about!("shuf.md");
struct Options {
head_count: usize,
output: Option<String>,
random_source: Option<String>,
output: Option<PathBuf>,
random_source: Option<PathBuf>,
repeat: bool,
sep: u8,
}
@ -54,53 +58,45 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let mode = if matches.get_flag(options::ECHO) {
Mode::Echo(
matches
.get_many::<String>(options::FILE_OR_ARGS)
.get_many(options::FILE_OR_ARGS)
.unwrap_or_default()
.map(String::from)
.cloned()
.collect(),
)
} else if let Some(range) = matches.get_one::<String>(options::INPUT_RANGE) {
match parse_range(range) {
Ok(m) => Mode::InputRange(m),
Err(msg) => {
return Err(USimpleError::new(1, msg));
}
}
} else if let Some(range) = matches.get_one(options::INPUT_RANGE).cloned() {
Mode::InputRange(range)
} else {
let mut operands = matches
.get_many::<String>(options::FILE_OR_ARGS)
.get_many::<OsString>(options::FILE_OR_ARGS)
.unwrap_or_default();
let file = operands.next().cloned().unwrap_or("-".into());
if let Some(second_file) = operands.next() {
return Err(UUsageError::new(
1,
format!("unexpected argument '{second_file}' found"),
format!("unexpected argument {} found", second_file.quote()),
));
};
Mode::Default(file)
Mode::Default(file.into())
};
let options = Options {
head_count: {
let headcounts = matches
.get_many::<String>(options::HEAD_COUNT)
// GNU shuf takes the lowest value passed, so we imitate that.
// It's probably a bug or an implementation artifact though.
// Busybox takes the final value which is more typical: later
// options override earlier options.
head_count: matches
.get_many::<usize>(options::HEAD_COUNT)
.unwrap_or_default()
.cloned()
.collect();
match parse_head_count(headcounts) {
Ok(val) => val,
Err(msg) => return Err(USimpleError::new(1, msg)),
}
},
output: matches.get_one::<String>(options::OUTPUT).map(String::from),
random_source: matches
.get_one::<String>(options::RANDOM_SOURCE)
.map(String::from),
.min()
.unwrap_or(usize::MAX),
output: matches.get_one(options::OUTPUT).cloned(),
random_source: matches.get_one(options::RANDOM_SOURCE).cloned(),
repeat: matches.get_flag(options::REPEAT),
sep: if matches.get_flag(options::ZERO_TERMINATED) {
0x00_u8
b'\0'
} else {
0x0a_u8
b'\n'
},
};
@ -108,7 +104,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
// Do not attempt to read the random source or the input file.
// However, we must touch the output file, if given:
if let Some(s) = options.output {
File::create(&s[..])
File::create(&s)
.map_err_context(|| format!("failed to open {} for writing", s.quote()))?;
}
return Ok(());
@ -116,8 +112,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
match mode {
Mode::Echo(args) => {
let mut evec = args.iter().map(String::as_bytes).collect::<Vec<_>>();
find_seps(&mut evec, options.sep);
let mut evec: Vec<&OsStr> = args.iter().map(AsRef::as_ref).collect();
shuf_exec(&mut evec, options)?;
}
Mode::InputRange(mut range) => {
@ -125,9 +120,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
}
Mode::Default(filename) => {
let fdata = read_input_file(&filename)?;
let mut fdata = vec![&fdata[..]];
find_seps(&mut fdata, options.sep);
shuf_exec(&mut fdata, options)?;
let mut items = split_seps(&fdata, options.sep);
shuf_exec(&mut items, options)?;
}
}
@ -155,6 +149,7 @@ pub fn uu_app() -> Command {
.long(options::INPUT_RANGE)
.value_name("LO-HI")
.help("treat each number LO through HI as an input line")
.value_parser(parse_range)
.conflicts_with(options::FILE_OR_ARGS),
)
.arg(
@ -163,7 +158,8 @@ pub fn uu_app() -> Command {
.long(options::HEAD_COUNT)
.value_name("COUNT")
.action(clap::ArgAction::Append)
.help("output at most COUNT lines"),
.help("output at most COUNT lines")
.value_parser(usize::from_str),
)
.arg(
Arg::new(options::OUTPUT)
@ -171,6 +167,7 @@ pub fn uu_app() -> Command {
.long(options::OUTPUT)
.value_name("FILE")
.help("write result to FILE instead of standard output")
.value_parser(ValueParser::path_buf())
.value_hint(clap::ValueHint::FilePath),
)
.arg(
@ -178,6 +175,7 @@ pub fn uu_app() -> Command {
.long(options::RANDOM_SOURCE)
.value_name("FILE")
.help("get random bytes from FILE")
.value_parser(ValueParser::path_buf())
.value_hint(clap::ValueHint::FilePath),
)
.arg(
@ -199,56 +197,33 @@ pub fn uu_app() -> Command {
.arg(
Arg::new(options::FILE_OR_ARGS)
.action(clap::ArgAction::Append)
.value_parser(ValueParser::os_string())
.value_hint(clap::ValueHint::FilePath),
)
}
fn read_input_file(filename: &str) -> UResult<Vec<u8>> {
let mut file = BufReader::new(if filename == "-" {
Box::new(stdin()) as Box<dyn Read>
} else {
let file = File::open(filename)
.map_err_context(|| format!("failed to open {}", filename.quote()))?;
Box::new(file) as Box<dyn Read>
});
fn read_input_file(filename: &Path) -> UResult<Vec<u8>> {
if filename.as_os_str() == "-" {
let mut data = Vec::new();
file.read_to_end(&mut data)
.map_err_context(|| format!("failed reading {}", filename.quote()))?;
stdin()
.read_to_end(&mut data)
.map_err_context(|| "read error".into())?;
Ok(data)
} else {
std::fs::read(filename).map_err_context(|| filename.maybe_quote().to_string())
}
}
fn find_seps(data: &mut Vec<&[u8]>, sep: u8) {
// Special case: If data is empty (and does not even contain a single 'sep'
// to indicate the presence of the empty element), then behave as if the input contained no elements at all.
if data.len() == 1 && data[0].is_empty() {
data.clear();
return;
}
// need to use for loop so we don't borrow the vector as we modify it in place
// basic idea:
// * We don't care about the order of the result. This lets us slice the slices
// without making a new vector.
// * Starting from the end of the vector, we examine each element.
// * If that element contains the separator, we remove it from the vector,
// and then sub-slice it into slices that do not contain the separator.
// * We maintain the invariant throughout that each element in the vector past
// the ith element does not have any separators remaining.
for i in (0..data.len()).rev() {
if data[i].contains(&sep) {
let this = data.swap_remove(i);
let mut p = 0;
for i in memchr_iter(sep, this) {
data.push(&this[p..i]);
p = i + 1;
}
if p < this.len() {
data.push(&this[p..]);
}
}
fn split_seps(data: &[u8], sep: u8) -> Vec<&[u8]> {
// A single trailing separator is ignored.
// If data is empty (and does not even contain a single 'sep'
// to indicate the presence of an empty element), then behave
// as if the input contained no elements at all.
let mut elements: Vec<&[u8]> = data.split(|&b| b == sep).collect();
if elements.last().is_some_and(|e| e.is_empty()) {
elements.pop();
}
elements
}
trait Shufable {
@ -293,6 +268,24 @@ impl<'a> Shufable for Vec<&'a [u8]> {
}
}
impl<'a> Shufable for Vec<&'a OsStr> {
type Item = &'a OsStr;
fn is_empty(&self) -> bool {
(**self).is_empty()
}
fn choose(&self, rng: &mut WrappedRng) -> Self::Item {
(**self).choose(rng).unwrap()
}
type PartialShuffleIterator<'b> = std::iter::Copied<std::slice::Iter<'b, &'a OsStr>> where Self: 'b;
fn partial_shuffle<'b>(
&'b mut self,
rng: &'b mut WrappedRng,
amount: usize,
) -> Self::PartialShuffleIterator<'b> {
(**self).partial_shuffle(rng, amount).0.iter().copied()
}
}
impl Shufable for RangeInclusive<usize> {
type Item = usize;
fn is_empty(&self) -> bool {
@ -330,7 +323,7 @@ impl<'a> NonrepeatingIterator<'a> {
fn new(range: RangeInclusive<usize>, rng: &'a mut WrappedRng, amount: usize) -> Self {
let capped_amount = if range.start() > range.end() {
0
} else if *range.start() == 0 && *range.end() == usize::MAX {
} else if range == (0..=usize::MAX) {
amount
} else {
amount.min(range.end() - range.start() + 1)
@ -404,34 +397,40 @@ fn number_set_should_list_remaining(listed_count: usize, range_size: usize) -> b
}
trait Writable {
fn write_all_to(&self, output: &mut impl Write) -> Result<(), Error>;
fn write_all_to(&self, output: &mut impl OsWrite) -> Result<(), Error>;
}
impl Writable for &[u8] {
fn write_all_to(&self, output: &mut impl Write) -> Result<(), Error> {
fn write_all_to(&self, output: &mut impl OsWrite) -> Result<(), Error> {
output.write_all(self)
}
}
impl Writable for &OsStr {
fn write_all_to(&self, output: &mut impl OsWrite) -> Result<(), Error> {
output.write_all_os(self)
}
}
impl Writable for usize {
fn write_all_to(&self, output: &mut impl Write) -> Result<(), Error> {
output.write_all(format!("{self}").as_bytes())
fn write_all_to(&self, output: &mut impl OsWrite) -> Result<(), Error> {
write!(output, "{self}")
}
}
fn shuf_exec(input: &mut impl Shufable, opts: Options) -> UResult<()> {
let mut output = BufWriter::new(match opts.output {
None => Box::new(stdout()) as Box<dyn Write>,
None => Box::new(stdout()) as Box<dyn OsWrite>,
Some(s) => {
let file = File::create(&s[..])
let file = File::create(&s)
.map_err_context(|| format!("failed to open {} for writing", s.quote()))?;
Box::new(file) as Box<dyn Write>
Box::new(file) as Box<dyn OsWrite>
}
});
let mut rng = match opts.random_source {
Some(r) => {
let file = File::open(&r[..])
let file = File::open(&r)
.map_err_context(|| format!("failed to open random source {}", r.quote()))?;
WrappedRng::RngFile(rand_read_adapter::ReadRng::new(file))
}
@ -467,33 +466,18 @@ fn shuf_exec(input: &mut impl Shufable, opts: Options) -> UResult<()> {
fn parse_range(input_range: &str) -> Result<RangeInclusive<usize>, String> {
if let Some((from, to)) = input_range.split_once('-') {
let begin = from
.parse::<usize>()
.map_err(|_| format!("invalid input range: {}", from.quote()))?;
let end = to
.parse::<usize>()
.map_err(|_| format!("invalid input range: {}", to.quote()))?;
let begin = from.parse::<usize>().map_err(|e| e.to_string())?;
let end = to.parse::<usize>().map_err(|e| e.to_string())?;
if begin <= end || begin == end + 1 {
Ok(begin..=end)
} else {
Err(format!("invalid input range: {}", input_range.quote()))
Err("start exceeds end".into())
}
} else {
Err(format!("invalid input range: {}", input_range.quote()))
Err("missing '-'".into())
}
}
fn parse_head_count(headcounts: Vec<String>) -> Result<usize, String> {
let mut result = usize::MAX;
for count in headcounts {
match count.parse::<usize>() {
Ok(pv) => result = std::cmp::min(result, pv),
Err(_) => return Err(format!("invalid line count: {}", count.quote())),
}
}
Ok(result)
}
enum WrappedRng {
RngFile(rand_read_adapter::ReadRng<File>),
RngDefault(rand::rngs::ThreadRng),
@ -522,6 +506,31 @@ impl RngCore for WrappedRng {
}
}
#[cfg(test)]
mod test_split_seps {
use super::split_seps;
#[test]
fn test_empty_input() {
assert!(split_seps(b"", b'\n').is_empty());
}
#[test]
fn test_single_blank_line() {
assert_eq!(split_seps(b"\n", b'\n'), &[b""]);
}
#[test]
fn test_with_trailing() {
assert_eq!(split_seps(b"a\nb\nc\n", b'\n'), &[b"a", b"b", b"c"]);
}
#[test]
fn test_without_trailing() {
assert_eq!(split_seps(b"a\nb\nc", b'\n'), &[b"a", b"b", b"c"]);
}
}
#[cfg(test)]
// Since the computed value is a bool, it is more readable to write the expected value out:
#[allow(clippy::bool_assert_comparison)]

View file

@ -25,7 +25,8 @@
//! ```
use std::ffi::OsStr;
use std::io::{self, Write as IoWrite};
use std::fs::File;
use std::io::{self, BufWriter, Stdout, StdoutLock, Write as IoWrite};
#[cfg(unix)]
use std::os::unix::ffi::OsStrExt;
@ -42,33 +43,77 @@ pub use os_display::{Quotable, Quoted};
/// output is likely to be captured, like `pwd` and `basename`. For informational output
/// use `Quotable::quote`.
///
/// FIXME: This is lossy on Windows. It could probably be implemented using some low-level
/// API that takes UTF-16, without going through io::Write. This is not a big priority
/// FIXME: Invalid Unicode will produce an error on Windows. That could be fixed by
/// using low-level library calls and bypassing `io::Write`. This is not a big priority
/// because broken filenames are much rarer on Windows than on Unix.
pub fn println_verbatim<S: AsRef<OsStr>>(text: S) -> io::Result<()> {
let stdout = io::stdout();
let mut stdout = stdout.lock();
#[cfg(any(unix, target_os = "wasi"))]
{
stdout.write_all(text.as_ref().as_bytes())?;
let mut stdout = io::stdout().lock();
stdout.write_all_os(text.as_ref())?;
stdout.write_all(b"\n")?;
}
#[cfg(not(any(unix, target_os = "wasi")))]
{
writeln!(stdout, "{}", std::path::Path::new(text.as_ref()).display())?;
}
Ok(())
}
/// Like `println_verbatim`, without the trailing newline.
pub fn print_verbatim<S: AsRef<OsStr>>(text: S) -> io::Result<()> {
let mut stdout = io::stdout();
io::stdout().write_all_os(text.as_ref())
}
/// [`io::Write`], but for OS strings.
///
/// On Unix this works straightforwardly.
///
/// On Windows this currently returns an error if the OS string is not valid Unicode.
/// This may in the future change to allow those strings to be written to consoles.
pub trait OsWrite: io::Write {
/// Write the entire OS string into this writer.
///
/// # Errors
///
/// An error is returned if the underlying I/O operation fails.
///
/// On Windows, if the OS string is not valid Unicode, an error of kind
/// [`io::ErrorKind::InvalidData`] is returned.
fn write_all_os(&mut self, buf: &OsStr) -> io::Result<()> {
#[cfg(any(unix, target_os = "wasi"))]
{
stdout.write_all(text.as_ref().as_bytes())
self.write_all(buf.as_bytes())
}
#[cfg(not(any(unix, target_os = "wasi")))]
{
write!(stdout, "{}", std::path::Path::new(text.as_ref()).display())
// It's possible to write a better OsWrite impl for Windows consoles (e.g. Stdout)
// as those are fundamentally 16-bit. If the OS string is invalid then it can be
// encoded to 16-bit and written using raw windows_sys calls. But this is quite involved
// (see `sys/pal/windows/stdio.rs` in the stdlib) and the value-add is small.
//
// There's no way to write invalid OS strings to Windows files, as those are 8-bit.
match buf.to_str() {
Some(text) => self.write_all(text.as_bytes()),
// We could output replacement characters instead, but the
// stdlib errors when sending invalid UTF-8 to the console,
// so let's follow that.
None => Err(io::Error::new(
io::ErrorKind::InvalidData,
"OS string cannot be converted to bytes",
)),
}
}
}
}
// We do not have a blanket impl for all Write because a smarter Windows impl should
// be able to make use of AsRawHandle. Please keep this in mind when adding new impls.
impl OsWrite for File {}
impl OsWrite for Stdout {}
impl OsWrite for StdoutLock<'_> {}
// A future smarter Windows implementation can first flush the BufWriter before
// doing a raw write.
impl<W: OsWrite> OsWrite for BufWriter<W> {}
impl OsWrite for Box<dyn OsWrite> {
fn write_all_os(&mut self, buf: &OsStr) -> io::Result<()> {
let this: &mut dyn OsWrite = self;
this.write_all_os(buf)
}
}

View file

@ -362,6 +362,51 @@ fn test_echo_short_collapsed_zero() {
assert_eq!(result_seq, ["a", "b", "c"], "Output is not a permutation");
}
#[test]
fn test_echo_separators_in_arguments() {
// We used to split arguments themselves on newlines, but this was wrong.
// shuf should behave as though it's shuffling two arguments and therefore
// output all of them.
// (Note that arguments can't contain null bytes so we don't need to test that.)
let result = new_ucmd!()
.arg("-e")
.arg("-n2")
.arg("a\nb")
.arg("c\nd")
.succeeds();
result.no_stderr();
assert_eq!(result.stdout_str().len(), 8, "Incorrect output length");
}
#[cfg(unix)]
#[test]
fn test_echo_invalid_unicode_in_arguments() {
use std::{ffi::OsStr, os::unix::ffi::OsStrExt};
let result = new_ucmd!()
.arg("-e")
.arg(OsStr::from_bytes(b"a\xFFb"))
.arg("ok")
.succeeds();
result.no_stderr();
assert!(result.stdout().contains(&b'\xFF'));
}
#[cfg(any(unix, target_os = "wasi"))]
#[cfg(not(target_os = "macos"))]
#[test]
fn test_invalid_unicode_in_filename() {
use std::{ffi::OsStr, os::unix::ffi::OsStrExt};
let (at, mut ucmd) = at_and_ucmd!();
let filename = OsStr::from_bytes(b"a\xFFb");
at.append(filename, "foo\n");
let result = ucmd.arg(filename).succeeds();
result.no_stderr();
assert_eq!(result.stdout(), b"foo\n");
}
#[test]
fn test_head_count() {
let repeat_limit = 5;
@ -647,23 +692,21 @@ fn test_shuf_invalid_input_range_one() {
new_ucmd!()
.args(&["-i", "0"])
.fails()
.stderr_contains("invalid input range");
.stderr_contains("invalid value '0' for '--input-range <LO-HI>': missing '-'");
}
#[test]
fn test_shuf_invalid_input_range_two() {
new_ucmd!()
.args(&["-i", "a-9"])
.fails()
.stderr_contains("invalid input range: 'a'");
new_ucmd!().args(&["-i", "a-9"]).fails().stderr_contains(
"invalid value 'a-9' for '--input-range <LO-HI>': invalid digit found in string",
);
}
#[test]
fn test_shuf_invalid_input_range_three() {
new_ucmd!()
.args(&["-i", "0-b"])
.fails()
.stderr_contains("invalid input range: 'b'");
new_ucmd!().args(&["-i", "0-b"]).fails().stderr_contains(
"invalid value '0-b' for '--input-range <LO-HI>': invalid digit found in string",
);
}
#[test]
@ -702,10 +745,9 @@ fn test_shuf_three_input_files() {
#[test]
fn test_shuf_invalid_input_line_count() {
new_ucmd!()
.args(&["-n", "a"])
.fails()
.stderr_contains("invalid line count: 'a'");
new_ucmd!().args(&["-n", "a"]).fails().stderr_contains(
"invalid value 'a' for '--head-count <COUNT>': invalid digit found in string",
);
}
#[test]
@ -772,7 +814,7 @@ fn test_range_empty_minus_one() {
.arg("-i5-3")
.fails()
.no_stdout()
.stderr_only("shuf: invalid input range: '5-3'\n");
.stderr_contains("invalid value '5-3' for '--input-range <LO-HI>': start exceeds end\n");
}
#[test]
@ -802,5 +844,5 @@ fn test_range_repeat_empty_minus_one() {
.arg("-ri5-3")
.fails()
.no_stdout()
.stderr_only("shuf: invalid input range: '5-3'\n");
.stderr_contains("invalid value '5-3' for '--input-range <LO-HI>': start exceeds end\n");
}

View file

@ -892,7 +892,8 @@ impl AtPath {
.unwrap_or_else(|e| panic!("Couldn't write {name}: {e}"));
}
pub fn append(&self, name: &str, contents: &str) {
pub fn append(&self, name: impl AsRef<Path>, contents: &str) {
let name = name.as_ref();
log_info("write(append)", self.plus_as_string(name));
let mut f = OpenOptions::new()
.append(true)
@ -900,7 +901,7 @@ impl AtPath {
.open(self.plus(name))
.unwrap();
f.write_all(contents.as_bytes())
.unwrap_or_else(|e| panic!("Couldn't write(append) {name}: {e}"));
.unwrap_or_else(|e| panic!("Couldn't write(append) {}: {e}", name.display()));
}
pub fn append_bytes(&self, name: &str, contents: &[u8]) {