From 97a49c7c95ec51839650b3b95ebff7d44ea401db Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Fri, 7 May 2021 15:07:17 -0400 Subject: [PATCH] wc: compute min width to format counts up front Fix two issues with the string formatting width for counts displayed by `wc`. First, the output was previously not using the default minimum width (seven characters) when reading from `stdin`. This commit corrects this behavior to match GNU `wc`. For example, $ cat alice_in_wonderland.txt | wc 5 57 302 Second, if at least 10^7 bytes were read from `stdin` *after* reading from a smaller regular file, then every output row would have width 8. This disagrees with GNU `wc`, in which only the `stdin` row and the total row would have width 8. This commit corrects this behavior to match GNU `wc`. For example, $ printf "%.0s0" {1..10000000} | wc emptyfile.txt - 0 0 0 emptyfile.txt 0 1 10000000 0 1 10000000 total Fixes #2186. --- src/uu/wc/src/wc.rs | 106 +++++++++++++++++++++++++++++++++++---- tests/by-util/test_wc.rs | 38 +++++++++++--- 2 files changed, 128 insertions(+), 16 deletions(-) diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 5670508f4..b323f7261 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -20,11 +20,13 @@ use wordcount::{TitledWordCount, WordCount}; use clap::{App, Arg, ArgMatches}; use thiserror::Error; -use std::cmp::max; -use std::fs::File; +use std::fs::{self, File}; use std::io::{self, ErrorKind, Write}; use std::path::Path; +/// The minimum character width for formatting counts when reading from stdin. +const MINIMUM_WIDTH: usize = 7; + #[derive(Error, Debug)] pub enum WcError { #[error("{0}")] @@ -277,11 +279,101 @@ fn show_error(input: &Input, err: WcError) { }; } +/// Compute the number of digits needed to represent any count for this input. +/// +/// If `input` is [`Input::Stdin`], then this function returns +/// [`MINIMUM_WIDTH`]. Otherwise, if metadata could not be read from +/// `input` then this function returns 1. +/// +/// # Errors +/// +/// This function will return an error if `input` is a [`Input::Path`] +/// and there is a problem accessing the metadata of the given `input`. +/// +/// # Examples +/// +/// A [`Input::Stdin`] gets a default minimum width: +/// +/// ```rust,ignore +/// let input = Input::Stdin(StdinKind::Explicit); +/// assert_eq!(7, digit_width(input)); +/// ``` +fn digit_width(input: &Input) -> WcResult> { + match input { + Input::Stdin(_) => Ok(Some(MINIMUM_WIDTH)), + Input::Path(filename) => { + let path = Path::new(filename); + let metadata = fs::metadata(path)?; + if metadata.is_file() { + // TODO We are now computing the number of bytes in a file + // twice: once here and once in `WordCount::from_line()` (or + // in `count_bytes_fast()` if that function is called + // instead). See GitHub issue #2201. + let num_bytes = metadata.len(); + let num_digits = num_bytes.to_string().len(); + Ok(Some(num_digits)) + } else { + Ok(None) + } + } + } +} + +/// Compute the number of digits needed to represent all counts in all inputs. +/// +/// `inputs` may include zero or more [`Input::Stdin`] entries, each of +/// which represents reading from `stdin`. The presence of any such +/// entry causes this function to return a width that is at least +/// [`MINIMUM_WIDTH`]. +/// +/// If `input` is empty, then this function returns 1. If file metadata +/// could not be read from any of the [`Input::Path`] inputs and there +/// are no [`Input::Stdin`] inputs, then this function returns 1. +/// +/// If there is a problem accessing the metadata, this function will +/// silently ignore the error and assume that the number of digits +/// needed to display the counts for that file is 1. +/// +/// # Examples +/// +/// An empty slice implies a width of 1: +/// +/// ```rust,ignore +/// assert_eq!(1, max_width(&vec![])); +/// ``` +/// +/// The presence of [`Input::Stdin`] implies a minimum width: +/// +/// ```rust,ignore +/// let inputs = vec![Input::Stdin(StdinKind::Explicit)]; +/// assert_eq!(7, max_width(&inputs)); +/// ``` +fn max_width(inputs: &[Input]) -> usize { + let mut result = 1; + for input in inputs { + match digit_width(input) { + Ok(maybe_n) => { + if let Some(n) = maybe_n { + result = result.max(n); + } + } + Err(_) => continue, + } + } + result +} + fn wc(inputs: Vec, settings: &Settings) -> Result<(), u32> { + // Compute the width, in digits, to use when formatting counts. + // + // The width is the number of digits needed to print the number of + // bytes in the largest file. This is true regardless of whether + // the `settings` indicate that the bytes will be displayed. + let mut error_count = 0; + let max_width = max_width(&inputs); + let mut total_word_count = WordCount::default(); let mut results = vec![]; - let mut max_width: usize = 0; - let mut error_count = 0; let num_inputs = inputs.len(); @@ -291,12 +383,6 @@ fn wc(inputs: Vec, settings: &Settings) -> Result<(), u32> { error_count += 1; WordCount::default() }); - // Compute the number of digits needed to display the number - // of bytes in the file. Even if the settings indicate that we - // won't *display* the number of bytes, we still use the - // number of digits in the byte count as the width when - // formatting each count as a string for output. - max_width = max(max_width, word_count.bytes.to_string().len()); total_word_count += word_count; results.push(word_count.with_title(input.to_title())); } diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index 8036d0eaa..1203c0b1d 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -33,7 +33,7 @@ fn test_stdin_default() { new_ucmd!() .pipe_in_fixture("lorem_ipsum.txt") .run() - .stdout_is(" 13 109 772\n"); + .stdout_is(" 13 109 772\n"); } #[test] @@ -42,7 +42,7 @@ fn test_stdin_explicit() { .pipe_in_fixture("lorem_ipsum.txt") .arg("-") .run() - .stdout_is(" 13 109 772 -\n"); + .stdout_is(" 13 109 772 -\n"); } #[test] @@ -51,9 +51,11 @@ fn test_utf8() { .args(&["-lwmcL"]) .pipe_in_fixture("UTF_8_test.txt") .run() - .stdout_is(" 300 4969 22781 22213 79\n"); - // GNU returns " 300 2086 22219 22781 79" - // TODO: we should fix that to match GNU's behavior + .stdout_is(" 300 4969 22781 22213 79\n"); + // GNU returns " 300 2086 22219 22781 79" + // + // TODO: we should fix the word, character, and byte count to + // match the behavior of GNU wc } #[test] @@ -80,7 +82,7 @@ fn test_stdin_all_counts() { .args(&["-c", "-m", "-l", "-L", "-w"]) .pipe_in_fixture("alice_in_wonderland.txt") .run() - .stdout_is(" 5 57 302 302 66\n"); + .stdout_is(" 5 57 302 302 66\n"); } #[test] @@ -169,6 +171,30 @@ fn test_file_one_long_word() { .stdout_is(" 1 1 10001 10001 10000 onelongword.txt\n"); } +/// Test that the number of bytes in the file dictate the display width. +/// +/// The width in digits of any count is the width in digits of the +/// number of bytes in the file, regardless of whether the number of +/// bytes are displayed. +#[test] +fn test_file_bytes_dictate_width() { + // This file has 10,001 bytes. Five digits are required to + // represent that. Even though the number of lines is 1 and the + // number of words is 0, each of those counts is formatted with + // five characters, filled with whitespace. + new_ucmd!() + .args(&["-lw", "onelongemptyline.txt"]) + .run() + .stdout_is(" 1 0 onelongemptyline.txt\n"); + + // This file has zero bytes. Only one digit is required to + // represent that. + new_ucmd!() + .args(&["-lw", "emptyfile.txt"]) + .run() + .stdout_is("0 0 emptyfile.txt\n"); +} + /// Test that getting counts from a directory is an error. #[test] fn test_read_from_directory_error() {