1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

wc: compute min width to format counts up front

Fix two issues with the string formatting width for counts displayed
by `wc`.

First, the output was previously not using the default minimum width
(seven characters) when reading from `stdin`. This commit corrects
this behavior to match GNU `wc`. For example,

    $ cat alice_in_wonderland.txt | wc
          5      57     302

Second, if at least 10^7 bytes were read from `stdin` *after* reading
from a smaller regular file, then every output row would have width
8. This disagrees with GNU `wc`, in which only the `stdin` row and the
total row would have width 8. This commit corrects this behavior to
match GNU `wc`. For example,

    $ printf "%.0s0" {1..10000000} | wc emptyfile.txt -
	  0       0       0 emptyfile.txt
	  0       1 10000000
	  0       1 10000000 total

Fixes #2186.
This commit is contained in:
Jeffrey Finkelstein 2021-05-07 15:07:17 -04:00
parent 620a5a5df6
commit 97a49c7c95
2 changed files with 128 additions and 16 deletions

View file

@ -20,11 +20,13 @@ use wordcount::{TitledWordCount, WordCount};
use clap::{App, Arg, ArgMatches};
use thiserror::Error;
use std::cmp::max;
use std::fs::File;
use std::fs::{self, File};
use std::io::{self, ErrorKind, Write};
use std::path::Path;
/// The minimum character width for formatting counts when reading from stdin.
const MINIMUM_WIDTH: usize = 7;
#[derive(Error, Debug)]
pub enum WcError {
#[error("{0}")]
@ -277,11 +279,101 @@ fn show_error(input: &Input, err: WcError) {
};
}
/// Compute the number of digits needed to represent any count for this input.
///
/// If `input` is [`Input::Stdin`], then this function returns
/// [`MINIMUM_WIDTH`]. Otherwise, if metadata could not be read from
/// `input` then this function returns 1.
///
/// # Errors
///
/// This function will return an error if `input` is a [`Input::Path`]
/// and there is a problem accessing the metadata of the given `input`.
///
/// # Examples
///
/// A [`Input::Stdin`] gets a default minimum width:
///
/// ```rust,ignore
/// let input = Input::Stdin(StdinKind::Explicit);
/// assert_eq!(7, digit_width(input));
/// ```
fn digit_width(input: &Input) -> WcResult<Option<usize>> {
match input {
Input::Stdin(_) => Ok(Some(MINIMUM_WIDTH)),
Input::Path(filename) => {
let path = Path::new(filename);
let metadata = fs::metadata(path)?;
if metadata.is_file() {
// TODO We are now computing the number of bytes in a file
// twice: once here and once in `WordCount::from_line()` (or
// in `count_bytes_fast()` if that function is called
// instead). See GitHub issue #2201.
let num_bytes = metadata.len();
let num_digits = num_bytes.to_string().len();
Ok(Some(num_digits))
} else {
Ok(None)
}
}
}
}
/// Compute the number of digits needed to represent all counts in all inputs.
///
/// `inputs` may include zero or more [`Input::Stdin`] entries, each of
/// which represents reading from `stdin`. The presence of any such
/// entry causes this function to return a width that is at least
/// [`MINIMUM_WIDTH`].
///
/// If `input` is empty, then this function returns 1. If file metadata
/// could not be read from any of the [`Input::Path`] inputs and there
/// are no [`Input::Stdin`] inputs, then this function returns 1.
///
/// If there is a problem accessing the metadata, this function will
/// silently ignore the error and assume that the number of digits
/// needed to display the counts for that file is 1.
///
/// # Examples
///
/// An empty slice implies a width of 1:
///
/// ```rust,ignore
/// assert_eq!(1, max_width(&vec![]));
/// ```
///
/// The presence of [`Input::Stdin`] implies a minimum width:
///
/// ```rust,ignore
/// let inputs = vec![Input::Stdin(StdinKind::Explicit)];
/// assert_eq!(7, max_width(&inputs));
/// ```
fn max_width(inputs: &[Input]) -> usize {
let mut result = 1;
for input in inputs {
match digit_width(input) {
Ok(maybe_n) => {
if let Some(n) = maybe_n {
result = result.max(n);
}
}
Err(_) => continue,
}
}
result
}
fn wc(inputs: Vec<Input>, settings: &Settings) -> Result<(), u32> {
// Compute the width, in digits, to use when formatting counts.
//
// The width is the number of digits needed to print the number of
// bytes in the largest file. This is true regardless of whether
// the `settings` indicate that the bytes will be displayed.
let mut error_count = 0;
let max_width = max_width(&inputs);
let mut total_word_count = WordCount::default();
let mut results = vec![];
let mut max_width: usize = 0;
let mut error_count = 0;
let num_inputs = inputs.len();
@ -291,12 +383,6 @@ fn wc(inputs: Vec<Input>, settings: &Settings) -> Result<(), u32> {
error_count += 1;
WordCount::default()
});
// Compute the number of digits needed to display the number
// of bytes in the file. Even if the settings indicate that we
// won't *display* the number of bytes, we still use the
// number of digits in the byte count as the width when
// formatting each count as a string for output.
max_width = max(max_width, word_count.bytes.to_string().len());
total_word_count += word_count;
results.push(word_count.with_title(input.to_title()));
}

View file

@ -33,7 +33,7 @@ fn test_stdin_default() {
new_ucmd!()
.pipe_in_fixture("lorem_ipsum.txt")
.run()
.stdout_is(" 13 109 772\n");
.stdout_is(" 13 109 772\n");
}
#[test]
@ -42,7 +42,7 @@ fn test_stdin_explicit() {
.pipe_in_fixture("lorem_ipsum.txt")
.arg("-")
.run()
.stdout_is(" 13 109 772 -\n");
.stdout_is(" 13 109 772 -\n");
}
#[test]
@ -51,9 +51,11 @@ fn test_utf8() {
.args(&["-lwmcL"])
.pipe_in_fixture("UTF_8_test.txt")
.run()
.stdout_is(" 300 4969 22781 22213 79\n");
// GNU returns " 300 2086 22219 22781 79"
// TODO: we should fix that to match GNU's behavior
.stdout_is(" 300 4969 22781 22213 79\n");
// GNU returns " 300 2086 22219 22781 79"
//
// TODO: we should fix the word, character, and byte count to
// match the behavior of GNU wc
}
#[test]
@ -80,7 +82,7 @@ fn test_stdin_all_counts() {
.args(&["-c", "-m", "-l", "-L", "-w"])
.pipe_in_fixture("alice_in_wonderland.txt")
.run()
.stdout_is(" 5 57 302 302 66\n");
.stdout_is(" 5 57 302 302 66\n");
}
#[test]
@ -169,6 +171,30 @@ fn test_file_one_long_word() {
.stdout_is(" 1 1 10001 10001 10000 onelongword.txt\n");
}
/// Test that the number of bytes in the file dictate the display width.
///
/// The width in digits of any count is the width in digits of the
/// number of bytes in the file, regardless of whether the number of
/// bytes are displayed.
#[test]
fn test_file_bytes_dictate_width() {
// This file has 10,001 bytes. Five digits are required to
// represent that. Even though the number of lines is 1 and the
// number of words is 0, each of those counts is formatted with
// five characters, filled with whitespace.
new_ucmd!()
.args(&["-lw", "onelongemptyline.txt"])
.run()
.stdout_is(" 1 0 onelongemptyline.txt\n");
// This file has zero bytes. Only one digit is required to
// represent that.
new_ucmd!()
.args(&["-lw", "emptyfile.txt"])
.run()
.stdout_is("0 0 emptyfile.txt\n");
}
/// Test that getting counts from a directory is an error.
#[test]
fn test_read_from_directory_error() {