From ba8f4ea67041c500a2ca55fc09e87408d8a531f2 Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Tue, 4 May 2021 22:13:28 -0400 Subject: [PATCH] wc: move counting code into WordCount::from_line() Refactor the counting code from the inner loop of the `wc` program into the `WordCount::from_line()` associated function. This commit also splits that function up into other helper functions that encapsulate decoding characters and finding word boundaries from raw bytes. This commit also implements the `Sum` trait for the `WordCount` struct, so that we can simply call `sum()` on an iterator that yields `WordCount` instances. --- src/uu/wc/src/wc.rs | 73 +++++---------------------------- src/uu/wc/src/wordcount.rs | 84 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 62 deletions(-) diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 8e973ccbd..33b2ba5ec 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -24,7 +24,6 @@ use std::cmp::max; use std::fs::File; use std::io::{self, Write}; use std::path::Path; -use std::str::from_utf8; #[derive(Error, Debug)] pub enum WcError { @@ -163,18 +162,6 @@ pub fn uumain(args: impl uucore::Args) -> i32 { } } -const CR: u8 = b'\r'; -const LF: u8 = b'\n'; -const SPACE: u8 = b' '; -const TAB: u8 = b'\t'; -const SYN: u8 = 0x16_u8; -const FF: u8 = 0x0C_u8; - -#[inline(always)] -fn is_word_separator(byte: u8) -> bool { - byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF -} - fn word_count_from_reader( mut reader: T, settings: &Settings, @@ -195,58 +182,20 @@ fn word_count_from_reader( // we do not need to decode the byte stream if we're only counting bytes/newlines let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length; - let mut line_count: usize = 0; - let mut word_count: usize = 0; - let mut byte_count: usize = 0; - let mut char_count: usize = 0; - let mut longest_line_length: usize = 0; - let mut ends_lf: bool; - - // reading 
from a TTY seems to raise a condition on, rather than return Some(0) like a file. - // hence the option wrapped in a result here - for line_result in reader.lines() { - let raw_line = match line_result { - Ok(l) => l, + // Sum the WordCount for each line. Show a warning for each line + // that results in an IO error when trying to read it. + let total = reader + .lines() + .filter_map(|res| match res { + Ok(line) => Some(line), Err(e) => { show_warning!("Error while reading {}: {}", path, e); - continue; + None } - }; - - // GNU 'wc' only counts lines that end in LF as lines - ends_lf = *raw_line.last().unwrap() == LF; - line_count += ends_lf as usize; - - byte_count += raw_line.len(); - - if decode_chars { - // try and convert the bytes to UTF-8 first - let current_char_count; - match from_utf8(&raw_line[..]) { - Ok(line) => { - word_count += line.split_whitespace().count(); - current_char_count = line.chars().count(); - } - Err(..) => { - word_count += raw_line.split(|&x| is_word_separator(x)).count(); - current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count() - } - } - char_count += current_char_count; - if current_char_count > longest_line_length { - // -L is a GNU 'wc' extension so same behavior on LF - longest_line_length = current_char_count - (ends_lf as usize); - } - } - } - - Ok(WordCount { - bytes: byte_count, - chars: char_count, - lines: line_count, - words: word_count, - max_line_length: longest_line_length, - }) + }) + .map(|line| WordCount::from_line(&line, decode_chars)) + .sum(); + Ok(total) } fn word_count_from_path(path: &str, settings: &Settings) -> WcResult { diff --git a/src/uu/wc/src/wordcount.rs b/src/uu/wc/src/wordcount.rs index 38efb216f..785e57eff 100644 --- a/src/uu/wc/src/wordcount.rs +++ b/src/uu/wc/src/wordcount.rs @@ -1,5 +1,19 @@ use std::cmp::max; +use std::iter::Sum; use std::ops::{Add, AddAssign}; +use std::str::from_utf8; + +const CR: u8 = b'\r'; +const LF: u8 = b'\n'; +const SPACE: u8 = b' '; +const TAB: u8 = 
b'\t'; +const SYN: u8 = 0x16_u8; +const FF: u8 = 0x0C_u8; + +#[inline(always)] +fn is_word_separator(byte: u8) -> bool { + byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF +} #[derive(Debug, Default, Copy, Clone)] pub struct WordCount { @@ -30,10 +44,80 @@ impl AddAssign for WordCount { } } +impl Sum for WordCount { + fn sum<I>(iter: I) -> WordCount + where + I: Iterator<Item = WordCount>, + { + iter.fold(WordCount::default(), |acc, x| acc + x) + } +} + impl WordCount { + /// Count the characters and whitespace-separated words in the given bytes. + /// + /// `line` is a slice of bytes that will be decoded as ASCII characters. + fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) { + let word_count = line.split(|&x| is_word_separator(x)).count(); + let char_count = line.iter().filter(|c| c.is_ascii()).count(); + (word_count, char_count) + } + + /// Create a [`WordCount`] from a sequence of bytes representing a line. + /// + /// If the last byte of `line` encodes a newline character (`\n`), + /// then the [`lines`] field will be set to 1. Otherwise, it will + /// be set to 0. The [`bytes`] field is simply the length of + /// `line`. + /// + /// If `decode_chars` is `false`, the [`chars`] and [`words`] + /// fields will be set to 0. If it is `true`, this function will + /// attempt to decode the bytes first as UTF-8, and failing that, + /// as ASCII. + pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount { + // GNU 'wc' only counts lines that end in LF as lines + let lines = (*line.last().unwrap() == LF) as usize; + let bytes = line.len(); + let (words, chars) = if decode_chars { + WordCount::word_and_char_count(line) + } else { + (0, 0) + }; + // -L is a GNU 'wc' extension so same behavior on LF + let max_line_length = if chars > 0 { chars - lines } else { 0 }; + WordCount { + bytes, + chars, + lines, + words, + max_line_length, + } + } + + /// Count the UTF-8 characters and words in the given string slice.
+ /// + /// `s` is a string slice that is assumed to be a UTF-8 string. + fn utf8_word_and_char_count(s: &str) -> (usize, usize) { + let word_count = s.split_whitespace().count(); + let char_count = s.chars().count(); + (word_count, char_count) + } + pub fn with_title(self, title: &str) -> TitledWordCount { TitledWordCount { title, count: self } } + + /// Count the characters and words in the given slice of bytes. + /// + /// `line` is a slice of bytes that will be decoded as UTF-8 + /// characters, or if that fails, as ASCII characters. + fn word_and_char_count(line: &[u8]) -> (usize, usize) { + // try and convert the bytes to UTF-8 first + match from_utf8(line) { + Ok(s) => WordCount::utf8_word_and_char_count(s), + Err(..) => WordCount::ascii_word_and_char_count(line), + } + } } /// This struct supplements the actual word count with a title that is displayed