From c4b53a44b5629b17fb85a0818c803a475a6071ea Mon Sep 17 00:00:00 2001 From: Jed Denlea Date: Fri, 31 Mar 2023 23:57:53 -0700 Subject: [PATCH] wc: make --files0-from work with streams --- src/uu/wc/src/countable.rs | 1 + src/uu/wc/src/wc.rs | 448 +++++++++++++++++++++++++------------ tests/by-util/test_wc.rs | 22 +- 3 files changed, 315 insertions(+), 156 deletions(-) diff --git a/src/uu/wc/src/countable.rs b/src/uu/wc/src/countable.rs index 5596decb3..b86b96fa2 100644 --- a/src/uu/wc/src/countable.rs +++ b/src/uu/wc/src/countable.rs @@ -28,6 +28,7 @@ impl WordCountable for StdinLock<'_> { self } } + impl WordCountable for File { type Buffered = BufReader; diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 2ed43e16a..08d00db18 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -13,11 +13,12 @@ mod utf8; mod word_count; use std::{ - borrow::Cow, + borrow::{Borrow, Cow}, cmp::max, - ffi::{OsStr, OsString}, + ffi::OsString, fs::{self, File}, - io::{self, Read, Write}, + io::{self, Write}, + iter, path::{Path, PathBuf}, }; @@ -42,17 +43,17 @@ use crate::{ /// The minimum character width for formatting counts when reading from stdin. const MINIMUM_WIDTH: usize = 7; -struct Settings { +struct Settings<'a> { show_bytes: bool, show_chars: bool, show_lines: bool, show_words: bool, show_max_line_length: bool, - files0_from_path: Option, + files0_from: Option>, total_when: TotalWhen, } -impl Default for Settings { +impl Default for Settings<'_> { fn default() -> Self { // Defaults if none of -c, -m, -l, -w, nor -L are specified. Self { @@ -61,15 +62,15 @@ impl Default for Settings { show_lines: true, show_words: true, show_max_line_length: false, - files0_from_path: None, + files0_from: None, total_when: TotalWhen::default(), } } } -impl Settings { - fn new(matches: &ArgMatches) -> Self { - let files0_from_path = matches +impl<'a> Settings<'a> { + fn new(matches: &'a ArgMatches) -> Self { + let files0_from = matches .get_one::(options::FILES0_FROM) .map(Into::into); @@ -84,7 +85,7 @@ impl Settings { show_lines: matches.get_flag(options::LINES), show_words: matches.get_flag(options::WORDS), show_max_line_length: matches.get_flag(options::MAX_LINE_LENGTH), - files0_from_path, + files0_from, total_when, }; @@ -92,7 +93,7 @@ impl Settings { settings } else { Self { - files0_from_path: settings.files0_from_path, + files0_from: settings.files0_from, total_when, ..Default::default() } @@ -139,6 +140,74 @@ static QS_QUOTE_ESCAPE: &QuotingStyle = &QuotingStyle::Shell { show_control: false, }; +/// Supported inputs. +#[derive(Debug)] +enum Inputs<'a> { + /// Default Standard input, i.e. no arguments. + Stdin, + /// Files; "-" means stdin, possibly multiple times! + Paths(Vec>), + /// --files0-from; "-" means stdin. + Files0From(Input<'a>), +} + +impl<'a> Inputs<'a> { + fn new(matches: &'a ArgMatches) -> UResult { + let arg_files = matches.get_many::(ARG_FILES); + let files0_from = matches.get_one::(options::FILES0_FROM); + + match (arg_files, files0_from) { + (None, None) => Ok(Self::Stdin), + (Some(files), None) => Ok(Self::Paths(files.map(Into::into).collect())), + (None, Some(path)) => { + // If path is a file, and the file isn't too large, we'll load it ahead + // of time. Every path within the file will have its length checked to + // hopefully better align the output columns. + let input = Input::from(path); + match input.try_as_files0()? { + Some(paths) => Ok(Self::Paths(paths)), + None => Ok(Self::Files0From(input)), + } + } + (Some(_), Some(_)) => Err(WcError::FilesDisabled.into()), + } + } + + // Creates an iterator which yields values borrowed from the command line arguments. + // Returns an error if the file specified in --files0-from cannot be opened. + fn try_iter( + &'a self, + settings: &'a Settings<'a>, + ) -> UResult>> { + let base: Box> = match self { + Self::Stdin => Box::new(iter::once(Ok(Input::Stdin(StdinKind::Implicit)))), + Self::Paths(inputs) => Box::new(inputs.iter().map(|i| Ok(i.as_borrowed()))), + Self::Files0From(input) => match input { + Input::Path(path) => Box::new(files0_iter_file(path)?), + Input::Stdin(_) => Box::new(files0_iter_stdin()), + }, + }; + + // The index of each yielded item must be tracked for error reporting. + let mut with_idx = base.enumerate(); + let files0_from_path = settings.files0_from.as_ref().map(|i| i.as_borrowed()); + + let iter = iter::from_fn(move || { + let (idx, next) = with_idx.next()?; + match next { + // filter zero length file names... + Ok(Input::Path(p)) if p.as_os_str().is_empty() => Some(Err({ + let maybe_ctx = files0_from_path.as_ref().map(|p| (p, idx)); + WcError::zero_len(maybe_ctx).into() + })), + _ => Some(next), + } + }); + Ok(iter) + } +} + +#[derive(Clone, Copy, Debug)] enum StdinKind { /// Specified on command-line with "-" (STDIN_REPR) Explicit, @@ -146,26 +215,44 @@ enum StdinKind { Implicit, } -/// Supported inputs. -enum Input { - /// A regular file. - Path(PathBuf), - - /// Standard input. +/// Represents a single input, either to be counted or processed for other files names via +/// --files0-from. +#[derive(Debug)] +enum Input<'a> { + Path(Cow<'a, Path>), Stdin(StdinKind), } -impl From<&OsStr> for Input { - fn from(input: &OsStr) -> Self { - if input == STDIN_REPR { +impl From for Input<'_> { + fn from(p: PathBuf) -> Self { + if p.as_os_str() == STDIN_REPR { Self::Stdin(StdinKind::Explicit) } else { - Self::Path(input.into()) + Self::Path(Cow::Owned(p)) } } } -impl Input { +impl<'a, T: AsRef + ?Sized> From<&'a T> for Input<'a> { + fn from(p: &'a T) -> Self { + let p = p.as_ref(); + if p.as_os_str() == STDIN_REPR { + Self::Stdin(StdinKind::Explicit) + } else { + Self::Path(Cow::Borrowed(p)) + } + } +} + +impl<'a> Input<'a> { + /// Translates Path(Cow::Owned(_)) to Path(Cow::Borrowed(_)). + fn as_borrowed(&'a self) -> Self { + match self { + Self::Path(p) => Self::Path(Cow::Borrowed(p.borrow())), + Self::Stdin(k) => Self::Stdin(*k), + } + } + /// Converts input to title that appears in stats. fn to_title(&self) -> Option> { match self { @@ -185,6 +272,56 @@ impl Input { Self::Stdin(_) => String::from("standard input"), } } + + /// When given --files0-from, we may be given a path or stdin. Either may be a stream or + /// a regular file. If given a file less than 10 MiB, it will be consumed and turned into + /// a Vec of Input::Paths which can be scanned to determine the widths of the columns that + /// will ultimately be printed. + fn try_as_files0(&self) -> UResult>>> { + match self { + Self::Path(path) => match fs::metadata(path) { + Ok(meta) if meta.is_file() && meta.len() <= (10 << 20) => Ok(Some( + files0_iter_file(path)?.collect::, _>>()?, + )), + _ => Ok(None), + }, + Self::Stdin(_) if is_stdin_small_file() => { + Ok(Some(files0_iter_stdin().collect::, _>>()?)) + } + Self::Stdin(_) => Ok(None), + } + } +} + +#[cfg(unix)] +fn is_stdin_small_file() -> bool { + use std::os::unix::io::{AsRawFd, FromRawFd}; + // Safety: we'll rely on Rust to give us a valid RawFd for stdin with which we can attempt to + // open a File, but only for the sake of fetching .metadata(). ManuallyDrop will ensure we + // don't do anything else to the FD if anything unexpected happens. + let f = std::mem::ManuallyDrop::new(unsafe { File::from_raw_fd(io::stdin().as_raw_fd()) }); + matches!(f.metadata(), Ok(meta) if meta.is_file() && meta.len() <= (10 << 20)) +} + +#[cfg(windows)] +fn is_stdin_small_file() -> bool { + use std::os::windows::io::{AsRawHandle, FromRawHandle}; + // Safety: we'll rely on Rust to give us a valid RawHandle for stdin with which we can attempt + // to open a File, but only for the sake of fetching .metadata(). ManuallyDrop will ensure we + // don't do anything else to the FD if anything unexpected happens. + let f = std::mem::ManuallyDrop::new(unsafe { + let h = io::stdin().as_raw_handle(); + if h.is_null() { + return false; + } + File::from_raw_handle(h) + }); + matches!(f.metadata(), Ok(meta) if meta.is_file() && meta.len() <= (10 << 20)) +} + +#[cfg(not(any(unix, windows)))] +fn is_stdin_small_file() -> bool { + false } /// When to show the "total" line @@ -228,14 +365,17 @@ enum WcError { #[error("invalid zero-length file name")] ZeroLengthFileName, #[error("{path}:{idx}: invalid zero-length file name")] - ZeroLengthFileNameCtx { path: String, idx: usize }, + ZeroLengthFileNameCtx { path: Cow<'static, str>, idx: usize }, } impl WcError { - fn zero_len(ctx: Option<(&Path, usize)>) -> Self { + fn zero_len(ctx: Option<(&Input, usize)>) -> Self { match ctx { - Some((path, idx)) => { - let path = escape_name(path.as_os_str(), QS_ESCAPE); + Some((input, idx)) => { + let path = match input { + Input::Stdin(_) => STDIN_REPR.into(), + Input::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE).into(), + }; Self::ZeroLengthFileNameCtx { path, idx } } None => Self::ZeroLengthFileName, @@ -253,9 +393,8 @@ impl UError for WcError { pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; - let inputs = inputs(&matches)?; - let settings = Settings::new(&matches); + let inputs = Inputs::new(&matches)?; wc(&inputs, &settings) } @@ -332,48 +471,6 @@ pub fn uu_app() -> Command { ) } -fn inputs(matches: &ArgMatches) -> UResult> { - match matches.get_many::(ARG_FILES) { - Some(os_values) => { - if matches.contains_id(options::FILES0_FROM) { - return Err(WcError::FilesDisabled.into()); - } - - Ok(os_values.map(|s| Input::from(s.as_os_str())).collect()) - } - None => match matches.get_one::(options::FILES0_FROM) { - Some(files_0_from) => create_paths_from_files0(Path::new(files_0_from)), - None => Ok(vec![Input::Stdin(StdinKind::Implicit)]), - }, - } -} - -fn create_paths_from_files0(files_0_from: &Path) -> UResult> { - let mut paths = String::new(); - let read_from_stdin = files_0_from.as_os_str() == STDIN_REPR; - - if read_from_stdin { - io::stdin().lock().read_to_string(&mut paths)?; - } else { - File::open(files_0_from) - .map_err_context(|| { - format!( - "cannot open {} for reading", - escape_name(files_0_from.as_os_str(), QS_QUOTE_ESCAPE) - ) - })? - .read_to_string(&mut paths)?; - } - - let paths: Vec<&str> = paths.split_terminator('\0').collect(); - - if read_from_stdin && paths.contains(&STDIN_REPR) { - return Err(WcError::StdinReprNotAllowed.into()); - } - - Ok(paths.iter().map(OsStr::new).map(Input::from).collect()) -} - fn word_count_from_reader( mut reader: T, settings: &Settings, @@ -555,109 +652,166 @@ enum CountResult { Failure(io::Error), } -/// If we fail opening a file we only show the error. If we fail reading it -/// we show a count for what we managed to read. +/// If we fail opening a file, we only show the error. If we fail reading the +/// file, we show a count for what we managed to read. /// -/// Therefore the reading implementations always return a total and sometimes +/// Therefore, the reading implementations always return a total and sometimes /// return an error: (WordCount, Option). -fn word_count_from_input(input: &Input, settings: &Settings) -> CountResult { - match input { - Input::Stdin(_) => { - let stdin = io::stdin(); - let stdin_lock = stdin.lock(); - let count = word_count_from_reader(stdin_lock, settings); - match count { - (total, Some(error)) => CountResult::Interrupted(total, error), - (total, None) => CountResult::Success(total), - } - } +fn word_count_from_input(input: &Input<'_>, settings: &Settings) -> CountResult { + let (total, maybe_err) = match input { + Input::Stdin(_) => word_count_from_reader(io::stdin().lock(), settings), Input::Path(path) => match File::open(path) { - Err(error) => CountResult::Failure(error), - Ok(file) => match word_count_from_reader(file, settings) { - (total, Some(error)) => CountResult::Interrupted(total, error), - (total, None) => CountResult::Success(total), - }, + Ok(f) => word_count_from_reader(f, settings), + Err(err) => return CountResult::Failure(err), }, + }; + match maybe_err { + None => CountResult::Success(total), + Some(err) => CountResult::Interrupted(total, err), } } /// Compute the number of digits needed to represent all counts in all inputs. /// -/// `inputs` may include zero or more [`Input::Stdin`] entries, each of -/// which represents reading from `stdin`. The presence of any such -/// entry causes this function to return a width that is at least -/// [`MINIMUM_WIDTH`]. +/// For [`Inputs::Stdin`], [`MINIMUM_WIDTH`] is returned, unless there is only one counter number +/// to be printed, in which case 1 is returned. /// -/// If `input` is empty, or if only one number needs to be printed (for just -/// one file) then this function is optimized to return 1 without making any -/// calls to get file metadata. +/// For [`Inputs::Files0From`], [`MINIMUM_WIDTH`] is returned. /// -/// If file metadata could not be read from any of the [`Input::Path`] input, -/// that input does not affect number width computation +/// An [`Inputs::Paths`] may include zero or more "-" entries, each of which represents reading +/// from `stdin`. The presence of any such entry causes this function to return a width that is at +/// least [`MINIMUM_WIDTH`]. /// -/// Otherwise, the file sizes in the file metadata are summed and the number of -/// digits in that total size is returned as the number width +/// If an [`Inputs::Paths`] contains only one path and only one number needs to be printed then +/// this function is optimized to return 1 without making any calls to get file metadata. /// -/// To mirror GNU wc's behavior a special case is added. If --files0-from is -/// used and input is read from stdin and there is only one calculation enabled -/// columns will not be aligned. This is not exactly GNU wc's behavior, but it -/// is close enough to pass the GNU test suite. -fn compute_number_width(inputs: &[Input], settings: &Settings) -> usize { - if inputs.is_empty() - || settings.number_enabled() == 1 - && (inputs.len() == 1 - || settings.files0_from_path.as_ref().map(|p| p.as_os_str()) - == Some(OsStr::new(STDIN_REPR))) - { - return 1; - } - - let mut minimum_width = 1; - let mut total = 0; - - for input in inputs { - match input { - Input::Stdin(_) => { - minimum_width = MINIMUM_WIDTH; +/// If file metadata could not be read from any of the [`Input::Path`] input, that input does not +/// affect number width computation. Otherwise, the file sizes from the files' metadata are summed +/// and the number of digits in that total size is returned. +fn compute_number_width(inputs: &Inputs, settings: &Settings) -> usize { + match inputs { + Inputs::Stdin if settings.number_enabled() == 1 => 1, + Inputs::Stdin => MINIMUM_WIDTH, + Inputs::Files0From(_) => 1, + Inputs::Paths(inputs) => { + if settings.number_enabled() == 1 && inputs.len() == 1 { + return 1; } - Input::Path(path) => { - if let Ok(meta) = fs::metadata(path) { - if meta.is_file() { - total += meta.len(); - } else { - minimum_width = MINIMUM_WIDTH; + + let mut minimum_width = 1; + let mut total: u64 = 0; + for input in inputs.iter() { + match input { + Input::Stdin(_) => minimum_width = MINIMUM_WIDTH, + Input::Path(path) => { + if let Ok(meta) = fs::metadata(path) { + if meta.is_file() { + total += meta.len(); + } else { + minimum_width = MINIMUM_WIDTH; + } + } } } } - } - } - if total == 0 { - minimum_width - } else { - let total_width = (1 + ilog10_u64(total)) - .try_into() - .expect("ilog of a u64 should fit into a usize"); - max(total_width, minimum_width) + if total == 0 { + minimum_width + } else { + let total_width = (1 + ilog10_u64(total)) + .try_into() + .expect("ilog of a u64 should fit into a usize"); + max(total_width, minimum_width) + } + } } } -fn wc(inputs: &[Input], settings: &Settings) -> UResult<()> { +type InputIterItem<'a> = Result, Box>; + +/// To be used with `--files0-from=-`, this applies a filter on the results of files0_iter to +/// translate '-' into the appropriate error. +fn files0_iter_stdin<'a>() -> impl Iterator> { + files0_iter(io::stdin().lock(), STDIN_REPR.into()).map(|i| match i { + Ok(Input::Stdin(_)) => Err(WcError::StdinReprNotAllowed.into()), + _ => i, + }) +} + +fn files0_iter_file<'a>(path: &Path) -> UResult>> { + match File::open(path) { + Ok(f) => Ok(files0_iter(f, path.into())), + Err(e) => Err(e.map_err_context(|| { + format!( + "cannot open {} for reading", + escape_name(path.as_os_str(), QS_QUOTE_ESCAPE) + ) + })), + } +} + +fn files0_iter<'a>( + r: impl io::Read + 'static, + err_path: OsString, +) -> impl Iterator> { + use std::io::BufRead; + let mut i = Some( + io::BufReader::new(r) + .split(b'\0') + .map(move |res| match res { + Ok(p) if p == STDIN_REPR.as_bytes() => Ok(Input::Stdin(StdinKind::Explicit)), + Ok(p) => { + // On Unix systems, OsStrings are just strings of bytes, not necessarily UTF-8. + #[cfg(unix)] + { + use std::os::unix::ffi::OsStringExt; + Ok(Input::Path(PathBuf::from(OsString::from_vec(p)).into())) + } + + // ...Windows does not, we must go through Strings. + #[cfg(not(unix))] + { + let s = String::from_utf8(p) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + Ok(Input::Path(PathBuf::from(s).into())) + } + } + Err(e) => Err(e.map_err_context(|| { + format!("{}: read error", escape_name(&err_path, QS_ESCAPE)) + }) as Box), + }), + ); + // Loop until there is an error; yield that error and then nothing else. + std::iter::from_fn(move || { + let next = i.as_mut().and_then(Iterator::next); + if matches!(next, Some(Err(_)) | None) { + i = None; + } + next + }) +} + +fn wc(inputs: &Inputs, settings: &Settings) -> UResult<()> { let mut total_word_count = WordCount::default(); + let mut num_inputs: usize = 0; let (number_width, are_stats_visible) = match settings.total_when { TotalWhen::Only => (1, false), _ => (compute_number_width(inputs, settings), true), }; - let is_total_row_visible = settings.total_when.is_total_row_visible(inputs.len()); - for (idx, input) in inputs.iter().enumerate() { - if matches!(input, Input::Path(p) if p.as_os_str().is_empty()) { - let err = WcError::zero_len(settings.files0_from_path.as_deref().map(|s| (s, idx))); - show!(err); - continue; - } - let word_count = match word_count_from_input(input, settings) { + for maybe_input in inputs.try_iter(settings)? { + num_inputs += 1; + + let input = match maybe_input { + Ok(input) => input, + Err(err) => { + show!(err); + continue; + } + }; + + let word_count = match word_count_from_input(&input, settings) { CountResult::Success(word_count) => word_count, CountResult::Interrupted(word_count, err) => { show!(err.map_err_context(|| input.path_display())); @@ -679,7 +833,7 @@ fn wc(inputs: &[Input], settings: &Settings) -> UResult<()> { } } - if is_total_row_visible { + if settings.total_when.is_total_row_visible(num_inputs) { let title = are_stats_visible.then_some("total"); if let Err(err) = print_stats(settings, &total_word_count, title, number_width) { show!(err.map_err_context(|| "failed to print total".into())); diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index 1ce27f18d..01c36b7c0 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -424,10 +424,12 @@ fn test_files0_from() { new_ucmd!() .args(&["--files0-from=files0_list.txt"]) .run() - .stdout_is( - " 13 109 772 lorem_ipsum.txt\n 18 204 1115 moby_dick.txt\n 5 57 302 \ - alice_in_wonderland.txt\n 36 370 2189 total\n", - ); + .stdout_is(concat!( + " 13 109 772 lorem_ipsum.txt\n", + " 18 204 1115 moby_dick.txt\n", + " 5 57 302 alice_in_wonderland.txt\n", + " 36 370 2189 total\n", + )); } #[test] @@ -436,7 +438,7 @@ fn test_files0_from_with_stdin() { .args(&["--files0-from=-"]) .pipe_in("lorem_ipsum.txt") .run() - .stdout_is(" 13 109 772 lorem_ipsum.txt\n"); + .stdout_is("13 109 772 lorem_ipsum.txt\n"); } #[test] @@ -445,10 +447,12 @@ fn test_files0_from_with_stdin_in_file() { .args(&["--files0-from=files0_list_with_stdin.txt"]) .pipe_in_fixture("alice_in_wonderland.txt") .run() - .stdout_is( - " 13 109 772 lorem_ipsum.txt\n 18 204 1115 moby_dick.txt\n 5 57 302 \ - -\n 36 370 2189 total\n", - ); + .stdout_is(concat!( + " 13 109 772 lorem_ipsum.txt\n", + " 18 204 1115 moby_dick.txt\n", + " 5 57 302 -\n", + " 36 370 2189 total\n", + )); } #[test]