wc: move counting code into WordCount::from_line()

Refactor the counting code from the inner loop of the `wc` program into the `WordCount::from_line()` associated function. This commit also splits that function up into other helper functions that encapsulate decoding characters and finding word boundaries from raw bytes. This commit also implements the `Sum` trait for the `WordCount` struct, so that we can simply call `sum()` on an iterator that yields `WordCount` instances.
2025-07-29 03:57:44 +00:00 · 2021-05-04 22:13:28 -04:00 · 2021-05-04 22:13:28 -04:00 · ba8f4ea670
commit ba8f4ea670
parent 50f4941d49
2 changed files with 95 additions and 62 deletions
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@ -24,7 +24,6 @@ use std::cmp::max;
 use std::fs::File;
 use std::io::{self, Write};
 use std::path::Path;
 use std::str::from_utf8;
 #[derive(Error, Debug)]
 pub enum WcError {
@ -163,18 +162,6 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
    }
 }
 /// Carriage return (`\r`).
 const CR: u8 = b'\r';
 /// Line feed (`\n`).
 const LF: u8 = b'\n';
 /// Plain space.
 const SPACE: u8 = b' ';
 /// Horizontal tab (`\t`).
 const TAB: u8 = b'\t';
 /// Byte value 0x16 (ASCII SYN, "synchronous idle").
 const SYN: u8 = 0x16_u8;
 /// Byte value 0x0C (ASCII form feed).
 const FF: u8 = 0x0C_u8;
 /// Report whether `byte` is one of the bytes that delimits words.
 ///
 /// The delimiters are exactly `SPACE`, `TAB`, `CR`, `SYN` and `FF`.
 /// `LF` is deliberately absent from this set.
 // NOTE(review): SYN (0x16) is a delimiter here while vertical tab (0x0B)
 // is not -- confirm that this is intended.
 #[inline(always)]
 fn is_word_separator(byte: u8) -> bool {
    matches!(byte, SPACE | TAB | CR | SYN | FF)
 }
 fn word_count_from_reader<T: WordCountable>(
    mut reader: T,
    settings: &Settings,
@ -195,58 +182,20 @@ fn word_count_from_reader<T: WordCountable>(
    // we do not need to decode the byte stream if we're only counting bytes/newlines
    let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
-    let mut line_count: usize = 0;
+    // Sum the WordCount for each line. Show a warning for each line
-    let mut word_count: usize = 0;
+    // that results in an IO error when trying to read it.
-    let mut byte_count: usize = 0;
+    let total = reader
-    let mut char_count: usize = 0;
+        .lines()
-    let mut longest_line_length: usize = 0;
+        .filter_map(|res| match res {
-    let mut ends_lf: bool;
+            Ok(line) => Some(line),
    // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
    // hence the option wrapped in a result here
    for line_result in reader.lines() {
        let raw_line = match line_result {
            Ok(l) => l,
            Err(e) => {
                show_warning!("Error while reading {}: {}", path, e);
-                continue;
+                None
            }
-        };
+        })
-
+        .map(|line| WordCount::from_line(&line, decode_chars))
-        // GNU 'wc' only counts lines that end in LF as lines
+        .sum();
-        ends_lf = *raw_line.last().unwrap() == LF;
+    Ok(total)
        line_count += ends_lf as usize;
        byte_count += raw_line.len();
        if decode_chars {
            // try and convert the bytes to UTF-8 first
            let current_char_count;
            match from_utf8(&raw_line[..]) {
                Ok(line) => {
                    word_count += line.split_whitespace().count();
                    current_char_count = line.chars().count();
                }
                Err(..) => {
                    word_count += raw_line.split(|&x| is_word_separator(x)).count();
                    current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
                }
            }
            char_count += current_char_count;
            if current_char_count > longest_line_length {
                // -L is a GNU 'wc' extension so same behavior on LF
                longest_line_length = current_char_count - (ends_lf as usize);
            }
        }
    }
    Ok(WordCount {
        bytes: byte_count,
        chars: char_count,
        lines: line_count,
        words: word_count,
        max_line_length: longest_line_length,
    })
 }
 fn word_count_from_path(path: &str, settings: &Settings) -> WcResult<WordCount> {
--- a/src/uu/wc/src/wordcount.rs
+++ b/src/uu/wc/src/wordcount.rs
@ -1,5 +1,19 @@
 use std::cmp::max;
 use std::iter::Sum;
 use std::ops::{Add, AddAssign};
 use std::str::from_utf8;
 /// Carriage return (`\r`).
 const CR: u8 = b'\r';
 /// Line feed (`\n`); the byte that terminates a counted line.
 const LF: u8 = b'\n';
 /// Plain space.
 const SPACE: u8 = b' ';
 /// Horizontal tab (`\t`).
 const TAB: u8 = b'\t';
 /// Byte value 0x16 (ASCII SYN, "synchronous idle").
 const SYN: u8 = 0x16_u8;
 /// Byte value 0x0C (ASCII form feed).
 const FF: u8 = 0x0C_u8;
 /// Report whether `byte` is one of the bytes that delimits words.
 ///
 /// The delimiters are exactly `SPACE`, `TAB`, `CR`, `SYN` and `FF`.
 /// `LF` is deliberately absent from this set.
 // NOTE(review): SYN (0x16) is a delimiter here while vertical tab (0x0B)
 // is not -- confirm that this is intended.
 #[inline(always)]
 fn is_word_separator(byte: u8) -> bool {
    matches!(byte, SPACE | TAB | CR | SYN | FF)
 }
 #[derive(Debug, Default, Copy, Clone)]
 pub struct WordCount {
@ -30,10 +44,80 @@ impl AddAssign for WordCount {
    }
 }
 /// Lets an iterator of [`WordCount`] values be collapsed with `.sum()`.
 impl Sum for WordCount {
    /// Add every item of `iter` onto a default-initialised [`WordCount`],
    /// in iteration order, and return the accumulated value.
    ///
    /// An empty iterator yields `WordCount::default()`.
    fn sum<I>(iter: I) -> WordCount
    where
        I: Iterator<Item = WordCount>,
    {
        let mut total = WordCount::default();
        for item in iter {
            total = total + item;
        }
        total
    }
 }
 impl WordCount {
    /// Count the characters and words in bytes that are not valid UTF-8.
    ///
    /// Words are the non-empty runs of bytes between delimiters. A byte is a
    /// delimiter when `is_word_separator` accepts it or when it is `LF`, so
    /// the newline that ends a line is never counted as (part of) a word.
    /// Characters are counted as the number of ASCII bytes in `line`;
    /// non-ASCII bytes are ignored for that count.
    ///
    /// Returns `(word_count, char_count)`.
    fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
        let word_count = line
            .split(|&byte| is_word_separator(byte) || byte == LF)
            // `split` yields an empty slice between adjacent delimiters and
            // at either end of the input; those are not words.
            .filter(|word| !word.is_empty())
            .count();
        let char_count = line.iter().filter(|byte| byte.is_ascii()).count();
        (word_count, char_count)
    }
    /// Create a [`WordCount`] from a sequence of bytes representing a line.
    ///
    /// If the last byte of `line` is a newline (`\n`), the `lines` field is
    /// set to 1; otherwise (including for an empty slice) it is set to 0.
    /// The `bytes` field is simply the length of `line`.
    ///
    /// If `decode_chars` is `false`, the `chars`, `words` and
    /// `max_line_length` fields are set to 0. If it is `true`, the bytes are
    /// decoded as UTF-8, and failing that, counted byte-wise as ASCII.
    /// `max_line_length` is the character count without the trailing
    /// newline.
    pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
        // GNU 'wc' only counts lines that end in LF as lines. Comparing
        // against `Some(&LF)` also covers an empty slice, which has no last
        // byte, instead of panicking on it.
        let lines = usize::from(line.last() == Some(&LF));
        let bytes = line.len();
        let (words, chars) = if decode_chars {
            WordCount::word_and_char_count(line)
        } else {
            (0, 0)
        };
        // -L is a GNU 'wc' extension so same behavior on LF: the newline is
        // not part of the line's length. Saturating keeps 0 chars at 0.
        let max_line_length = chars.saturating_sub(lines);
        WordCount {
            bytes,
            chars,
            lines,
            words,
            max_line_length,
        }
    }
    /// Count the UTF-8 characters and words in the given string slice.
    ///
    /// Words are split with `str::split_whitespace` and characters are
    /// counted with `str::chars`. Returns `(word_count, char_count)`.
    fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
        let word_count = s.split_whitespace().count();
        let char_count = s.chars().count();
        (word_count, char_count)
    }
    /// Pair this count with `title`, producing a [`TitledWordCount`].
    pub fn with_title(self, title: &str) -> TitledWordCount {
        TitledWordCount { title, count: self }
    }
    /// Count the characters and words in the given slice of bytes.
    ///
    /// `line` is decoded as UTF-8 when it is valid UTF-8; otherwise the
    /// byte-wise ASCII fallback is used. Returns `(word_count, char_count)`.
    fn word_and_char_count(line: &[u8]) -> (usize, usize) {
        // try and convert the bytes to UTF-8 first
        match from_utf8(line) {
            Ok(s) => WordCount::utf8_word_and_char_count(s),
            Err(..) => WordCount::ascii_word_and_char_count(line),
        }
    }
 }
 /// This struct supplements the actual word count with a title that is displayed